xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bf16.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
3; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
4; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
6; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
8; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
9
; Copies one scalar bfloat between two addrspace(1) pointers: load from %in,
; store the value unchanged to %out. Every run configuration is expected to emit
; exactly one 16-bit load and one 16-bit store with no conversion instructions
; in between. The expected-output lines below are tool-generated (see the NOTE
; at the top of the file); regenerate them instead of editing by hand.
10define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
11; GCN-LABEL: test_load_store:
12; GCN:       ; %bb.0:
13; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; GCN-NEXT:    s_mov_b32 s6, 0
15; GCN-NEXT:    s_mov_b32 s7, 0xf000
16; GCN-NEXT:    s_mov_b32 s4, s6
17; GCN-NEXT:    s_mov_b32 s5, s6
18; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
19; GCN-NEXT:    s_waitcnt vmcnt(0)
20; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
21; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
22; GCN-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX7-LABEL: test_load_store:
25; GFX7:       ; %bb.0:
26; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX7-NEXT:    s_mov_b32 s6, 0
28; GFX7-NEXT:    s_mov_b32 s7, 0xf000
29; GFX7-NEXT:    s_mov_b32 s4, s6
30; GFX7-NEXT:    s_mov_b32 s5, s6
31; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
32; GFX7-NEXT:    s_waitcnt vmcnt(0)
33; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
34; GFX7-NEXT:    s_waitcnt vmcnt(0)
35; GFX7-NEXT:    s_setpc_b64 s[30:31]
36;
37; GFX8-LABEL: test_load_store:
38; GFX8:       ; %bb.0:
39; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
41; GFX8-NEXT:    s_waitcnt vmcnt(0)
42; GFX8-NEXT:    flat_store_short v[2:3], v0
43; GFX8-NEXT:    s_waitcnt vmcnt(0)
44; GFX8-NEXT:    s_setpc_b64 s[30:31]
45;
46; GFX9-LABEL: test_load_store:
47; GFX9:       ; %bb.0:
48; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
50; GFX9-NEXT:    s_waitcnt vmcnt(0)
51; GFX9-NEXT:    global_store_short v[2:3], v0, off
52; GFX9-NEXT:    s_waitcnt vmcnt(0)
53; GFX9-NEXT:    s_setpc_b64 s[30:31]
54;
55; GFX10-LABEL: test_load_store:
56; GFX10:       ; %bb.0:
57; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
59; GFX10-NEXT:    s_waitcnt vmcnt(0)
60; GFX10-NEXT:    global_store_short v[2:3], v0, off
61; GFX10-NEXT:    s_setpc_b64 s[30:31]
62;
63; GFX11-LABEL: test_load_store:
64; GFX11:       ; %bb.0:
65; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
67; GFX11-NEXT:    s_waitcnt vmcnt(0)
68; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
69; GFX11-NEXT:    s_setpc_b64 s[30:31]
70  %val = load bfloat, ptr addrspace(1) %in
71  store bfloat %val, ptr addrspace(1) %out
72  ret void
73}
74
; Returns a <2 x bfloat> loaded from an addrspace(1) pointer. Every run
; configuration expects a single 32-bit load. The no-mcpu and hawaii runs also
; expect the loaded dword to be split over two result registers: v0 receives the
; low half shifted left by 16, v1 the high half masked with 0xffff0000. The
; tonga and newer runs expect the loaded dword to be returned directly in v0.
75define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
76; GCN-LABEL: v_load_global_v2bf16:
77; GCN:       ; %bb.0:
78; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79; GCN-NEXT:    s_mov_b32 s6, 0
80; GCN-NEXT:    s_mov_b32 s7, 0xf000
81; GCN-NEXT:    s_mov_b32 s4, s6
82; GCN-NEXT:    s_mov_b32 s5, s6
83; GCN-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
84; GCN-NEXT:    s_waitcnt vmcnt(0)
85; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
86; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
87; GCN-NEXT:    s_setpc_b64 s[30:31]
88;
89; GFX7-LABEL: v_load_global_v2bf16:
90; GFX7:       ; %bb.0:
91; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX7-NEXT:    s_mov_b32 s6, 0
93; GFX7-NEXT:    s_mov_b32 s7, 0xf000
94; GFX7-NEXT:    s_mov_b32 s4, s6
95; GFX7-NEXT:    s_mov_b32 s5, s6
96; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
97; GFX7-NEXT:    s_waitcnt vmcnt(0)
98; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
99; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
100; GFX7-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX8-LABEL: v_load_global_v2bf16:
103; GFX8:       ; %bb.0:
104; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX8-NEXT:    flat_load_dword v0, v[0:1]
106; GFX8-NEXT:    s_waitcnt vmcnt(0)
107; GFX8-NEXT:    s_setpc_b64 s[30:31]
108;
109; GFX9-LABEL: v_load_global_v2bf16:
110; GFX9:       ; %bb.0:
111; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112; GFX9-NEXT:    global_load_dword v0, v[0:1], off
113; GFX9-NEXT:    s_waitcnt vmcnt(0)
114; GFX9-NEXT:    s_setpc_b64 s[30:31]
115;
116; GFX10-LABEL: v_load_global_v2bf16:
117; GFX10:       ; %bb.0:
118; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119; GFX10-NEXT:    global_load_dword v0, v[0:1], off
120; GFX10-NEXT:    s_waitcnt vmcnt(0)
121; GFX10-NEXT:    s_setpc_b64 s[30:31]
122;
123; GFX11-LABEL: v_load_global_v2bf16:
124; GFX11:       ; %bb.0:
125; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
127; GFX11-NEXT:    s_waitcnt vmcnt(0)
128; GFX11-NEXT:    s_setpc_b64 s[30:31]
129  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
130  ret <2 x bfloat> %load
131}
132
; Returns a <3 x bfloat> loaded from an addrspace(1) pointer. Every run
; configuration expects a single 64-bit load even though only three 16-bit
; elements are requested. The no-mcpu and hawaii runs expect three result
; registers: v0/v1 from the low/high halves of the first dword and v2 from the
; low half of the second dword (shifted left by 16). The tonga and newer runs
; expect the two loaded dwords to be returned directly in v[0:1].
133define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
134; GCN-LABEL: v_load_global_v3bf16:
135; GCN:       ; %bb.0:
136; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137; GCN-NEXT:    s_mov_b32 s6, 0
138; GCN-NEXT:    s_mov_b32 s7, 0xf000
139; GCN-NEXT:    s_mov_b32 s4, s6
140; GCN-NEXT:    s_mov_b32 s5, s6
141; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
142; GCN-NEXT:    s_waitcnt vmcnt(0)
143; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
144; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
145; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
146; GCN-NEXT:    s_setpc_b64 s[30:31]
147;
148; GFX7-LABEL: v_load_global_v3bf16:
149; GFX7:       ; %bb.0:
150; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151; GFX7-NEXT:    s_mov_b32 s6, 0
152; GFX7-NEXT:    s_mov_b32 s7, 0xf000
153; GFX7-NEXT:    s_mov_b32 s4, s6
154; GFX7-NEXT:    s_mov_b32 s5, s6
155; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
156; GFX7-NEXT:    s_waitcnt vmcnt(0)
157; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
158; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
159; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
160; GFX7-NEXT:    s_setpc_b64 s[30:31]
161;
162; GFX8-LABEL: v_load_global_v3bf16:
163; GFX8:       ; %bb.0:
164; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
166; GFX8-NEXT:    s_waitcnt vmcnt(0)
167; GFX8-NEXT:    s_setpc_b64 s[30:31]
168;
169; GFX9-LABEL: v_load_global_v3bf16:
170; GFX9:       ; %bb.0:
171; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
173; GFX9-NEXT:    s_waitcnt vmcnt(0)
174; GFX9-NEXT:    s_setpc_b64 s[30:31]
175;
176; GFX10-LABEL: v_load_global_v3bf16:
177; GFX10:       ; %bb.0:
178; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
180; GFX10-NEXT:    s_waitcnt vmcnt(0)
181; GFX10-NEXT:    s_setpc_b64 s[30:31]
182;
183; GFX11-LABEL: v_load_global_v3bf16:
184; GFX11:       ; %bb.0:
185; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
187; GFX11-NEXT:    s_waitcnt vmcnt(0)
188; GFX11-NEXT:    s_setpc_b64 s[30:31]
189  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
190  ret <3 x bfloat> %load
191}
192
; Returns a <4 x bfloat> loaded from an addrspace(1) pointer. Every run
; configuration expects a single 64-bit load. The no-mcpu and hawaii runs expect
; the two loaded dwords to be split into four result registers v0..v3 (low half
; shifted left by 16, high half masked with 0xffff0000, per dword). The tonga
; and newer runs expect the loaded pair to be returned directly in v[0:1].
193define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
194; GCN-LABEL: v_load_global_v4bf16:
195; GCN:       ; %bb.0:
196; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197; GCN-NEXT:    s_mov_b32 s6, 0
198; GCN-NEXT:    s_mov_b32 s7, 0xf000
199; GCN-NEXT:    s_mov_b32 s4, s6
200; GCN-NEXT:    s_mov_b32 s5, s6
201; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
202; GCN-NEXT:    s_waitcnt vmcnt(0)
203; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
204; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
205; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
206; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
207; GCN-NEXT:    s_setpc_b64 s[30:31]
208;
209; GFX7-LABEL: v_load_global_v4bf16:
210; GFX7:       ; %bb.0:
211; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212; GFX7-NEXT:    s_mov_b32 s6, 0
213; GFX7-NEXT:    s_mov_b32 s7, 0xf000
214; GFX7-NEXT:    s_mov_b32 s4, s6
215; GFX7-NEXT:    s_mov_b32 s5, s6
216; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
217; GFX7-NEXT:    s_waitcnt vmcnt(0)
218; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
219; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
220; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
221; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
222; GFX7-NEXT:    s_setpc_b64 s[30:31]
223;
224; GFX8-LABEL: v_load_global_v4bf16:
225; GFX8:       ; %bb.0:
226; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
228; GFX8-NEXT:    s_waitcnt vmcnt(0)
229; GFX8-NEXT:    s_setpc_b64 s[30:31]
230;
231; GFX9-LABEL: v_load_global_v4bf16:
232; GFX9:       ; %bb.0:
233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
235; GFX9-NEXT:    s_waitcnt vmcnt(0)
236; GFX9-NEXT:    s_setpc_b64 s[30:31]
237;
238; GFX10-LABEL: v_load_global_v4bf16:
239; GFX10:       ; %bb.0:
240; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
242; GFX10-NEXT:    s_waitcnt vmcnt(0)
243; GFX10-NEXT:    s_setpc_b64 s[30:31]
244;
245; GFX11-LABEL: v_load_global_v4bf16:
246; GFX11:       ; %bb.0:
247; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
249; GFX11-NEXT:    s_waitcnt vmcnt(0)
250; GFX11-NEXT:    s_setpc_b64 s[30:31]
251  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
252  ret <4 x bfloat> %load
253}
254
; Returns a <6 x bfloat> loaded from an addrspace(1) pointer. The load width
; differs per configuration: the no-mcpu run expects a 4-dword load of which
; only v3..v5 are consumed, while hawaii and every newer run expect a 3-dword
; (96-bit) load. The no-mcpu and hawaii runs then split the three dwords into
; six result registers v0..v5; tonga and newer return the loaded v[0:2] as-is.
255define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
256; GCN-LABEL: v_load_global_v6bf16:
257; GCN:       ; %bb.0:
258; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GCN-NEXT:    s_mov_b32 s6, 0
260; GCN-NEXT:    s_mov_b32 s7, 0xf000
261; GCN-NEXT:    s_mov_b32 s4, s6
262; GCN-NEXT:    s_mov_b32 s5, s6
263; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
264; GCN-NEXT:    s_waitcnt vmcnt(0)
265; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
266; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
267; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
268; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
269; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
270; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
271; GCN-NEXT:    s_setpc_b64 s[30:31]
272;
273; GFX7-LABEL: v_load_global_v6bf16:
274; GFX7:       ; %bb.0:
275; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX7-NEXT:    s_mov_b32 s6, 0
277; GFX7-NEXT:    s_mov_b32 s7, 0xf000
278; GFX7-NEXT:    s_mov_b32 s4, s6
279; GFX7-NEXT:    s_mov_b32 s5, s6
280; GFX7-NEXT:    buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
281; GFX7-NEXT:    s_waitcnt vmcnt(0)
282; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
283; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
284; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
285; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
286; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
287; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
288; GFX7-NEXT:    s_setpc_b64 s[30:31]
289;
290; GFX8-LABEL: v_load_global_v6bf16:
291; GFX8:       ; %bb.0:
292; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
294; GFX8-NEXT:    s_waitcnt vmcnt(0)
295; GFX8-NEXT:    s_setpc_b64 s[30:31]
296;
297; GFX9-LABEL: v_load_global_v6bf16:
298; GFX9:       ; %bb.0:
299; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
301; GFX9-NEXT:    s_waitcnt vmcnt(0)
302; GFX9-NEXT:    s_setpc_b64 s[30:31]
303;
304; GFX10-LABEL: v_load_global_v6bf16:
305; GFX10:       ; %bb.0:
306; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
308; GFX10-NEXT:    s_waitcnt vmcnt(0)
309; GFX10-NEXT:    s_setpc_b64 s[30:31]
310;
311; GFX11-LABEL: v_load_global_v6bf16:
312; GFX11:       ; %bb.0:
313; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
315; GFX11-NEXT:    s_waitcnt vmcnt(0)
316; GFX11-NEXT:    s_setpc_b64 s[30:31]
317  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
318  ret <6 x bfloat> %load
319}
320
; Returns an <8 x bfloat> loaded from an addrspace(1) pointer. Every run
; configuration expects a single 128-bit load. The no-mcpu and hawaii runs
; expect the four loaded dwords (v4..v7) to be split into eight result registers
; v0..v7 using a shift-left-by-16 / mask-0xffff0000 pair per dword. The tonga
; and newer runs expect the loaded v[0:3] to be returned directly.
321define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
322; GCN-LABEL: v_load_global_v8bf16:
323; GCN:       ; %bb.0:
324; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GCN-NEXT:    s_mov_b32 s6, 0
326; GCN-NEXT:    s_mov_b32 s7, 0xf000
327; GCN-NEXT:    s_mov_b32 s4, s6
328; GCN-NEXT:    s_mov_b32 s5, s6
329; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
330; GCN-NEXT:    s_waitcnt vmcnt(0)
331; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
332; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
333; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
334; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
335; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
336; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
337; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
338; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
339; GCN-NEXT:    s_setpc_b64 s[30:31]
340;
341; GFX7-LABEL: v_load_global_v8bf16:
342; GFX7:       ; %bb.0:
343; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX7-NEXT:    s_mov_b32 s6, 0
345; GFX7-NEXT:    s_mov_b32 s7, 0xf000
346; GFX7-NEXT:    s_mov_b32 s4, s6
347; GFX7-NEXT:    s_mov_b32 s5, s6
348; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
349; GFX7-NEXT:    s_waitcnt vmcnt(0)
350; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
351; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
352; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
353; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
354; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
355; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
356; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
357; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
358; GFX7-NEXT:    s_setpc_b64 s[30:31]
359;
360; GFX8-LABEL: v_load_global_v8bf16:
361; GFX8:       ; %bb.0:
362; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
364; GFX8-NEXT:    s_waitcnt vmcnt(0)
365; GFX8-NEXT:    s_setpc_b64 s[30:31]
366;
367; GFX9-LABEL: v_load_global_v8bf16:
368; GFX9:       ; %bb.0:
369; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
371; GFX9-NEXT:    s_waitcnt vmcnt(0)
372; GFX9-NEXT:    s_setpc_b64 s[30:31]
373;
374; GFX10-LABEL: v_load_global_v8bf16:
375; GFX10:       ; %bb.0:
376; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
378; GFX10-NEXT:    s_waitcnt vmcnt(0)
379; GFX10-NEXT:    s_setpc_b64 s[30:31]
380;
381; GFX11-LABEL: v_load_global_v8bf16:
382; GFX11:       ; %bb.0:
383; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
385; GFX11-NEXT:    s_waitcnt vmcnt(0)
386; GFX11-NEXT:    s_setpc_b64 s[30:31]
387  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
388  ret <8 x bfloat> %load
389}
390
; Returns a <16 x bfloat> loaded from an addrspace(1) pointer. Every run
; configuration expects the access to be split into two 128-bit loads covering
; byte offsets 0 and 16.
;  - no-mcpu and hawaii runs: both loads are issued back to back, then each
;    result is unpacked into v0..v15 behind its own wait (vmcnt(1), then
;    vmcnt(0)).
;  - tonga run: the second address is formed with a v_add_u32 / v_addc_u32 pair
;    (+16) instead of an immediate offset on the load.
;  - gfx900 and newer runs: the second load carries offset:16; the gfx1010 and
;    gfx1100 runs additionally expect an s_clause 0x1 ahead of the two loads.
; The pointer is copied out of v[0:1] first in the tonga-and-newer runs because
; the first load's destination v[0:3] overlaps it.
391define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
392; GCN-LABEL: v_load_global_v16bf16:
393; GCN:       ; %bb.0:
394; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395; GCN-NEXT:    s_mov_b32 s6, 0
396; GCN-NEXT:    s_mov_b32 s7, 0xf000
397; GCN-NEXT:    s_mov_b32 s4, s6
398; GCN-NEXT:    s_mov_b32 s5, s6
399; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
400; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
401; GCN-NEXT:    s_waitcnt vmcnt(1)
402; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
403; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
404; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
405; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
406; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
407; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
408; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
409; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
410; GCN-NEXT:    s_waitcnt vmcnt(0)
411; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
412; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
413; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
414; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
415; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
416; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
417; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
418; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
419; GCN-NEXT:    s_setpc_b64 s[30:31]
420;
421; GFX7-LABEL: v_load_global_v16bf16:
422; GFX7:       ; %bb.0:
423; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424; GFX7-NEXT:    s_mov_b32 s6, 0
425; GFX7-NEXT:    s_mov_b32 s7, 0xf000
426; GFX7-NEXT:    s_mov_b32 s4, s6
427; GFX7-NEXT:    s_mov_b32 s5, s6
428; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
429; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
430; GFX7-NEXT:    s_waitcnt vmcnt(1)
431; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
432; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
433; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
434; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
435; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
436; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
437; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
438; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
439; GFX7-NEXT:    s_waitcnt vmcnt(0)
440; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
441; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
442; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
443; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
444; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
445; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
446; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
447; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
448; GFX7-NEXT:    s_setpc_b64 s[30:31]
449;
450; GFX8-LABEL: v_load_global_v16bf16:
451; GFX8:       ; %bb.0:
452; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453; GFX8-NEXT:    v_mov_b32_e32 v5, v1
454; GFX8-NEXT:    v_mov_b32_e32 v4, v0
455; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
456; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
457; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
458; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
459; GFX8-NEXT:    s_waitcnt vmcnt(0)
460; GFX8-NEXT:    s_setpc_b64 s[30:31]
461;
462; GFX9-LABEL: v_load_global_v16bf16:
463; GFX9:       ; %bb.0:
464; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; GFX9-NEXT:    v_mov_b32_e32 v9, v1
466; GFX9-NEXT:    v_mov_b32_e32 v8, v0
467; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
468; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
469; GFX9-NEXT:    s_waitcnt vmcnt(0)
470; GFX9-NEXT:    s_setpc_b64 s[30:31]
471;
472; GFX10-LABEL: v_load_global_v16bf16:
473; GFX10:       ; %bb.0:
474; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475; GFX10-NEXT:    v_mov_b32_e32 v9, v1
476; GFX10-NEXT:    v_mov_b32_e32 v8, v0
477; GFX10-NEXT:    s_clause 0x1
478; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
479; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
480; GFX10-NEXT:    s_waitcnt vmcnt(0)
481; GFX10-NEXT:    s_setpc_b64 s[30:31]
482;
483; GFX11-LABEL: v_load_global_v16bf16:
484; GFX11:       ; %bb.0:
485; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
487; GFX11-NEXT:    s_clause 0x1
488; GFX11-NEXT:    global_load_b128 v[0:3], v[4:5], off
489; GFX11-NEXT:    global_load_b128 v[4:7], v[4:5], off offset:16
490; GFX11-NEXT:    s_waitcnt vmcnt(0)
491; GFX11-NEXT:    s_setpc_b64 s[30:31]
492  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
493  ret <16 x bfloat> %load
494}
495
; Returns a <32 x bfloat> loaded from an addrspace(1) pointer. Every run
; configuration expects the access to be split into four 128-bit loads covering
; byte offsets 0, 16, 32 and 48.
;  - no-mcpu and hawaii runs: all four loads are issued first, then each result
;    is unpacked into v0..v31 behind a descending wait (vmcnt(3) down to
;    vmcnt(0)), so unpacking of an earlier load can begin before later loads
;    are waited on.
;  - tonga run: the +16/+32/+48 addresses are formed with v_add_u32 /
;    v_addc_u32 pairs rather than immediate offsets on the loads.
;  - gfx900 and newer runs: the loads carry immediate offsets; the gfx1010 and
;    gfx1100 runs additionally expect an s_clause 0x3 ahead of the four loads.
496define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
497; GCN-LABEL: v_load_global_v32bf16:
498; GCN:       ; %bb.0:
499; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500; GCN-NEXT:    s_mov_b32 s6, 0
501; GCN-NEXT:    s_mov_b32 s7, 0xf000
502; GCN-NEXT:    s_mov_b32 s4, s6
503; GCN-NEXT:    s_mov_b32 s5, s6
504; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
505; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
506; GCN-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
507; GCN-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
508; GCN-NEXT:    s_waitcnt vmcnt(3)
509; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
510; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
511; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
512; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
513; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
514; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
515; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
516; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
517; GCN-NEXT:    s_waitcnt vmcnt(2)
518; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
519; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
520; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
521; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
522; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
523; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
524; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
525; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
526; GCN-NEXT:    s_waitcnt vmcnt(1)
527; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
528; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
529; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
530; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
531; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
532; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
533; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
534; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
535; GCN-NEXT:    s_waitcnt vmcnt(0)
536; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
537; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
538; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
539; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
540; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
541; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
542; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
543; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
544; GCN-NEXT:    s_setpc_b64 s[30:31]
545;
546; GFX7-LABEL: v_load_global_v32bf16:
547; GFX7:       ; %bb.0:
548; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549; GFX7-NEXT:    s_mov_b32 s6, 0
550; GFX7-NEXT:    s_mov_b32 s7, 0xf000
551; GFX7-NEXT:    s_mov_b32 s4, s6
552; GFX7-NEXT:    s_mov_b32 s5, s6
553; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
554; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
555; GFX7-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
556; GFX7-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
557; GFX7-NEXT:    s_waitcnt vmcnt(3)
558; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
559; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
560; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
561; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
562; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
563; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
564; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
565; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
566; GFX7-NEXT:    s_waitcnt vmcnt(2)
567; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
568; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
569; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
570; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
571; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
572; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
573; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
574; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
575; GFX7-NEXT:    s_waitcnt vmcnt(1)
576; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
577; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
578; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
579; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
580; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
581; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
582; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
583; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
584; GFX7-NEXT:    s_waitcnt vmcnt(0)
585; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
586; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
587; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
588; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
589; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
590; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
591; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
592; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
593; GFX7-NEXT:    s_setpc_b64 s[30:31]
594;
595; GFX8-LABEL: v_load_global_v32bf16:
596; GFX8:       ; %bb.0:
597; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GFX8-NEXT:    v_mov_b32_e32 v12, v0
599; GFX8-NEXT:    v_mov_b32_e32 v13, v1
600; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v12
601; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v13, vcc
602; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v12
603; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v13, vcc
604; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[12:13]
605; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 48, v12
606; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
607; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
608; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
609; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
610; GFX8-NEXT:    s_waitcnt vmcnt(0)
611; GFX8-NEXT:    s_setpc_b64 s[30:31]
612;
613; GFX9-LABEL: v_load_global_v32bf16:
614; GFX9:       ; %bb.0:
615; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616; GFX9-NEXT:    v_mov_b32_e32 v17, v1
617; GFX9-NEXT:    v_mov_b32_e32 v16, v0
618; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[16:17], off
619; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[16:17], off offset:16
620; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[16:17], off offset:32
621; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[16:17], off offset:48
622; GFX9-NEXT:    s_waitcnt vmcnt(0)
623; GFX9-NEXT:    s_setpc_b64 s[30:31]
624;
625; GFX10-LABEL: v_load_global_v32bf16:
626; GFX10:       ; %bb.0:
627; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX10-NEXT:    v_mov_b32_e32 v17, v1
629; GFX10-NEXT:    v_mov_b32_e32 v16, v0
630; GFX10-NEXT:    s_clause 0x3
631; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[16:17], off
632; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[16:17], off offset:16
633; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[16:17], off offset:32
634; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[16:17], off offset:48
635; GFX10-NEXT:    s_waitcnt vmcnt(0)
636; GFX10-NEXT:    s_setpc_b64 s[30:31]
637;
638; GFX11-LABEL: v_load_global_v32bf16:
639; GFX11:       ; %bb.0:
640; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX11-NEXT:    v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
642; GFX11-NEXT:    s_clause 0x3
643; GFX11-NEXT:    global_load_b128 v[0:3], v[12:13], off
644; GFX11-NEXT:    global_load_b128 v[4:7], v[12:13], off offset:16
645; GFX11-NEXT:    global_load_b128 v[8:11], v[12:13], off offset:32
646; GFX11-NEXT:    global_load_b128 v[12:15], v[12:13], off offset:48
647; GFX11-NEXT:    s_waitcnt vmcnt(0)
648; GFX11-NEXT:    s_setpc_b64 s[30:31]
649  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
650  ret <32 x bfloat> %load
651}
652
653define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
654; GCN-LABEL: v_load_global_v64bf16:
655; GCN:       ; %bb.0:
656; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657; GCN-NEXT:    s_mov_b32 s7, 0xf000
658; GCN-NEXT:    s_mov_b32 s6, 0
659; GCN-NEXT:    v_add_i32_e32 v7, vcc, 0x7c, v0
660; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x78, v0
661; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x74, v0
662; GCN-NEXT:    v_add_i32_e32 v10, vcc, 0x70, v0
663; GCN-NEXT:    v_add_i32_e32 v11, vcc, 0x6c, v0
664; GCN-NEXT:    v_add_i32_e32 v12, vcc, 0x68, v0
665; GCN-NEXT:    s_mov_b32 s4, s6
666; GCN-NEXT:    s_mov_b32 s5, s6
667; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
668; GCN-NEXT:    v_add_i32_e32 v13, vcc, 0x64, v0
669; GCN-NEXT:    v_add_i32_e32 v14, vcc, 0x60, v0
670; GCN-NEXT:    v_add_i32_e32 v15, vcc, 0x5c, v0
671; GCN-NEXT:    s_waitcnt vmcnt(0)
672; GCN-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
673; GCN-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
674; GCN-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
675; GCN-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
676; GCN-NEXT:    s_waitcnt expcnt(0)
677; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
678; GCN-NEXT:    v_add_i32_e32 v7, vcc, 0x58, v0
679; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x54, v0
680; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x50, v0
681; GCN-NEXT:    s_waitcnt vmcnt(0)
682; GCN-NEXT:    buffer_store_dword v6, v11, s[0:3], 0 offen
683; GCN-NEXT:    buffer_store_dword v5, v12, s[0:3], 0 offen
684; GCN-NEXT:    buffer_store_dword v4, v13, s[0:3], 0 offen
685; GCN-NEXT:    buffer_store_dword v3, v14, s[0:3], 0 offen
686; GCN-NEXT:    s_waitcnt expcnt(0)
687; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
688; GCN-NEXT:    v_add_i32_e32 v10, vcc, 0x4c, v0
689; GCN-NEXT:    v_add_i32_e32 v11, vcc, 0x48, v0
690; GCN-NEXT:    v_add_i32_e32 v12, vcc, 0x44, v0
691; GCN-NEXT:    s_waitcnt vmcnt(0)
692; GCN-NEXT:    buffer_store_dword v6, v15, s[0:3], 0 offen
693; GCN-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
694; GCN-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen
695; GCN-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
696; GCN-NEXT:    s_waitcnt expcnt(0)
697; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
698; GCN-NEXT:    v_add_i32_e32 v7, vcc, 64, v0
699; GCN-NEXT:    v_add_i32_e32 v19, vcc, 60, v0
700; GCN-NEXT:    v_add_i32_e32 v20, vcc, 56, v0
701; GCN-NEXT:    s_waitcnt vmcnt(0)
702; GCN-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
703; GCN-NEXT:    buffer_store_dword v5, v11, s[0:3], 0 offen
704; GCN-NEXT:    buffer_store_dword v4, v12, s[0:3], 0 offen
705; GCN-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
706; GCN-NEXT:    s_waitcnt expcnt(0)
707; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
708; GCN-NEXT:    buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
709; GCN-NEXT:    v_add_i32_e32 v21, vcc, 52, v0
710; GCN-NEXT:    buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
711; GCN-NEXT:    buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
712; GCN-NEXT:    s_waitcnt vmcnt(2)
713; GCN-NEXT:    buffer_store_dword v10, v19, s[0:3], 0 offen
714; GCN-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
715; GCN-NEXT:    buffer_store_dword v9, v20, s[0:3], 0 offen
716; GCN-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
717; GCN-NEXT:    buffer_store_dword v8, v21, s[0:3], 0 offen
718; GCN-NEXT:    s_waitcnt expcnt(0)
719; GCN-NEXT:    v_add_i32_e32 v8, vcc, 40, v0
720; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
721; GCN-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
722; GCN-NEXT:    s_waitcnt expcnt(0)
723; GCN-NEXT:    v_add_i32_e32 v7, vcc, 32, v0
724; GCN-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
725; GCN-NEXT:    v_add_i32_e32 v10, vcc, 24, v0
726; GCN-NEXT:    v_add_i32_e32 v19, vcc, 20, v0
727; GCN-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
728; GCN-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
729; GCN-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
730; GCN-NEXT:    s_waitcnt expcnt(0)
731; GCN-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
732; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
733; GCN-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
734; GCN-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
735; GCN-NEXT:    s_waitcnt expcnt(0)
736; GCN-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
737; GCN-NEXT:    s_waitcnt vmcnt(8)
738; GCN-NEXT:    buffer_store_dword v18, v9, s[0:3], 0 offen
739; GCN-NEXT:    buffer_store_dword v17, v10, s[0:3], 0 offen
740; GCN-NEXT:    buffer_store_dword v16, v19, s[0:3], 0 offen
741; GCN-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
742; GCN-NEXT:    buffer_store_dword v14, v5, s[0:3], 0 offen
743; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
744; GCN-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen
745; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
746; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
747; GCN-NEXT:    s_setpc_b64 s[30:31]
748;
749; GFX7-LABEL: v_load_global_v64bf16:
750; GFX7:       ; %bb.0:
751; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX7-NEXT:    s_mov_b32 s6, 0
753; GFX7-NEXT:    s_mov_b32 s7, 0xf000
754; GFX7-NEXT:    s_mov_b32 s4, s6
755; GFX7-NEXT:    s_mov_b32 s5, s6
756; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
757; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x7c, v0
758; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x78, v0
759; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x74, v0
760; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x70, v0
761; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 52, v0
762; GFX7-NEXT:    s_waitcnt vmcnt(0)
763; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
764; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
765; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
766; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
767; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
768; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x6c, v0
769; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x68, v0
770; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x64, v0
771; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x60, v0
772; GFX7-NEXT:    s_waitcnt vmcnt(0)
773; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
774; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
775; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
776; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
777; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
778; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x5c, v0
779; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x58, v0
780; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x54, v0
781; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x50, v0
782; GFX7-NEXT:    s_waitcnt vmcnt(0)
783; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
784; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
785; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
786; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
787; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
788; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x4c, v0
789; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
790; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x44, v0
791; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
792; GFX7-NEXT:    s_waitcnt vmcnt(0)
793; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
794; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
795; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
796; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
797; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
798; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
799; GFX7-NEXT:    buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
800; GFX7-NEXT:    buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
801; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
802; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
803; GFX7-NEXT:    s_waitcnt vmcnt(3)
804; GFX7-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
805; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
806; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
807; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
808; GFX7-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
809; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 40, v0
810; GFX7-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
811; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
812; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
813; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
814; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 24, v0
815; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 20, v0
816; GFX7-NEXT:    s_waitcnt vmcnt(6)
817; GFX7-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
818; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
819; GFX7-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
820; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 12, v0
821; GFX7-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
822; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
823; GFX7-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
824; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
825; GFX7-NEXT:    s_waitcnt vmcnt(9)
826; GFX7-NEXT:    buffer_store_dword v14, v5, s[0:3], 0 offen
827; GFX7-NEXT:    buffer_store_dword v13, v6, s[0:3], 0 offen
828; GFX7-NEXT:    buffer_store_dword v12, v19, s[0:3], 0 offen
829; GFX7-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
830; GFX7-NEXT:    s_waitcnt vmcnt(12)
831; GFX7-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
832; GFX7-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
833; GFX7-NEXT:    buffer_store_dword v16, v3, s[0:3], 0 offen
834; GFX7-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
835; GFX7-NEXT:    s_waitcnt vmcnt(0)
836; GFX7-NEXT:    s_setpc_b64 s[30:31]
837;
838; GFX8-LABEL: v_load_global_v64bf16:
839; GFX8:       ; %bb.0:
840; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841; GFX8-NEXT:    v_mov_b32_e32 v28, v0
842; GFX8-NEXT:    v_mov_b32_e32 v29, v1
843; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v28
844; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v29, vcc
845; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v28
846; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v29, vcc
847; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 48, v28
848; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v29, vcc
849; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 64, v28
850; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, 0, v29, vcc
851; GFX8-NEXT:    s_movk_i32 s4, 0x50
852; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v28
853; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v29, vcc
854; GFX8-NEXT:    s_movk_i32 s4, 0x60
855; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v28
856; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v29, vcc
857; GFX8-NEXT:    s_movk_i32 s4, 0x70
858; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[28:29]
859; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
860; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
861; GFX8-NEXT:    v_addc_u32_e32 v29, vcc, 0, v29, vcc
862; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
863; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
864; GFX8-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
865; GFX8-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
866; GFX8-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
867; GFX8-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
868; GFX8-NEXT:    s_waitcnt vmcnt(0)
869; GFX8-NEXT:    s_setpc_b64 s[30:31]
870;
871; GFX9-LABEL: v_load_global_v64bf16:
872; GFX9:       ; %bb.0:
873; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
874; GFX9-NEXT:    v_mov_b32_e32 v29, v1
875; GFX9-NEXT:    v_mov_b32_e32 v28, v0
876; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
877; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
878; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
879; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
880; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
881; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
882; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
883; GFX9-NEXT:    s_nop 0
884; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
885; GFX9-NEXT:    s_waitcnt vmcnt(0)
886; GFX9-NEXT:    s_setpc_b64 s[30:31]
887;
888; GFX10-LABEL: v_load_global_v64bf16:
889; GFX10:       ; %bb.0:
890; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX10-NEXT:    v_mov_b32_e32 v33, v1
892; GFX10-NEXT:    v_mov_b32_e32 v32, v0
893; GFX10-NEXT:    s_clause 0x7
894; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[32:33], off
895; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[32:33], off offset:16
896; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[32:33], off offset:32
897; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[32:33], off offset:48
898; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[32:33], off offset:64
899; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[32:33], off offset:80
900; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[32:33], off offset:96
901; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[32:33], off offset:112
902; GFX10-NEXT:    s_waitcnt vmcnt(0)
903; GFX10-NEXT:    s_setpc_b64 s[30:31]
904;
905; GFX11-LABEL: v_load_global_v64bf16:
906; GFX11:       ; %bb.0:
907; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
908; GFX11-NEXT:    v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
909; GFX11-NEXT:    s_clause 0x7
910; GFX11-NEXT:    global_load_b128 v[0:3], v[28:29], off
911; GFX11-NEXT:    global_load_b128 v[4:7], v[28:29], off offset:16
912; GFX11-NEXT:    global_load_b128 v[8:11], v[28:29], off offset:32
913; GFX11-NEXT:    global_load_b128 v[12:15], v[28:29], off offset:48
914; GFX11-NEXT:    global_load_b128 v[16:19], v[28:29], off offset:64
915; GFX11-NEXT:    global_load_b128 v[20:23], v[28:29], off offset:80
916; GFX11-NEXT:    global_load_b128 v[24:27], v[28:29], off offset:96
917; GFX11-NEXT:    global_load_b128 v[28:31], v[28:29], off offset:112
918; GFX11-NEXT:    s_waitcnt vmcnt(0)
919; GFX11-NEXT:    s_setpc_b64 s[30:31]
920  %load = load <64 x bfloat>, ptr addrspace(1) %ptr
921  ret <64 x bfloat> %load
922}
923
; Codegen test for storing a <2 x bfloat> argument (%val) through a
; ptr addrspace(1) argument (%ptr). The IR body is a single vector store.
; What the expected output below shows:
;   - the GCN and GFX7 check blocks combine the two incoming values into one
;     dword (v_mul_f32 by 1.0, v_lshrrev_b32 by 16, v_alignbit_b32) and emit
;     one buffer_store_dword;
;   - the GFX8, GFX9, GFX10 and GFX11 check blocks store v0 directly with a
;     single flat/global dword store and no packing instructions.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them rather than editing by hand.
924define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
925; GCN-LABEL: v_store_global_v2bf16:
926; GCN:       ; %bb.0:
927; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
929; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
930; GCN-NEXT:    s_mov_b32 s6, 0
931; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
932; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
933; GCN-NEXT:    s_mov_b32 s7, 0xf000
934; GCN-NEXT:    s_mov_b32 s4, s6
935; GCN-NEXT:    s_mov_b32 s5, s6
936; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
937; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
938; GCN-NEXT:    s_setpc_b64 s[30:31]
939;
940; GFX7-LABEL: v_store_global_v2bf16:
941; GFX7:       ; %bb.0:
942; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
944; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
945; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
946; GFX7-NEXT:    s_mov_b32 s6, 0
947; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
948; GFX7-NEXT:    s_mov_b32 s7, 0xf000
949; GFX7-NEXT:    s_mov_b32 s4, s6
950; GFX7-NEXT:    s_mov_b32 s5, s6
951; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
952; GFX7-NEXT:    s_waitcnt vmcnt(0)
953; GFX7-NEXT:    s_setpc_b64 s[30:31]
954;
955; GFX8-LABEL: v_store_global_v2bf16:
956; GFX8:       ; %bb.0:
957; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
958; GFX8-NEXT:    flat_store_dword v[1:2], v0
959; GFX8-NEXT:    s_waitcnt vmcnt(0)
960; GFX8-NEXT:    s_setpc_b64 s[30:31]
961;
962; GFX9-LABEL: v_store_global_v2bf16:
963; GFX9:       ; %bb.0:
964; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
965; GFX9-NEXT:    global_store_dword v[1:2], v0, off
966; GFX9-NEXT:    s_waitcnt vmcnt(0)
967; GFX9-NEXT:    s_setpc_b64 s[30:31]
968;
969; GFX10-LABEL: v_store_global_v2bf16:
970; GFX10:       ; %bb.0:
971; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
972; GFX10-NEXT:    global_store_dword v[1:2], v0, off
973; GFX10-NEXT:    s_setpc_b64 s[30:31]
974;
975; GFX11-LABEL: v_store_global_v2bf16:
976; GFX11:       ; %bb.0:
977; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
979; GFX11-NEXT:    s_setpc_b64 s[30:31]
980  store <2 x bfloat> %val, ptr addrspace(1) %ptr
981  ret void
982}
983
; Codegen test for storing a <3 x bfloat> argument (%val) through a
; ptr addrspace(1) argument (%ptr). The IR body is a single vector store.
; What the expected output below shows (every prefix splits the store into a
; dword at offset 0 and a 16-bit store at offset 4):
;   - the GCN and GFX7 check blocks first pack v0/v1 into one dword with
;     v_alignbit_b32 and shift the third value right by 16, then emit
;     buffer_store_short (offset 4) and buffer_store_dword;
;   - the GFX8 check block has no immediate offset on the flat stores, so it
;     adds 4 to the 64-bit address (v_add_u32 / v_addc_u32) between them;
;   - the GFX9, GFX10 and GFX11 check blocks use the offset field of the
;     global store instead; GFX11 also groups both stores under s_clause 0x1.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them rather than editing by hand.
984define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
985; GCN-LABEL: v_store_global_v3bf16:
986; GCN:       ; %bb.0:
987; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
989; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
990; GCN-NEXT:    s_mov_b32 s7, 0xf000
991; GCN-NEXT:    s_mov_b32 s6, 0
992; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
993; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
994; GCN-NEXT:    s_mov_b32 s4, s6
995; GCN-NEXT:    s_mov_b32 s5, s6
996; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
997; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
998; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
999; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
1000; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1001; GCN-NEXT:    s_setpc_b64 s[30:31]
1002;
1003; GFX7-LABEL: v_store_global_v3bf16:
1004; GFX7:       ; %bb.0:
1005; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1007; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1008; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1009; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1010; GFX7-NEXT:    s_mov_b32 s6, 0
1011; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
1012; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1013; GFX7-NEXT:    s_mov_b32 s4, s6
1014; GFX7-NEXT:    s_mov_b32 s5, s6
1015; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1016; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
1017; GFX7-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
1018; GFX7-NEXT:    s_waitcnt vmcnt(0)
1019; GFX7-NEXT:    s_setpc_b64 s[30:31]
1020;
1021; GFX8-LABEL: v_store_global_v3bf16:
1022; GFX8:       ; %bb.0:
1023; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024; GFX8-NEXT:    flat_store_dword v[2:3], v0
1025; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
1026; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1027; GFX8-NEXT:    flat_store_short v[2:3], v1
1028; GFX8-NEXT:    s_waitcnt vmcnt(0)
1029; GFX8-NEXT:    s_setpc_b64 s[30:31]
1030;
1031; GFX9-LABEL: v_store_global_v3bf16:
1032; GFX9:       ; %bb.0:
1033; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
1035; GFX9-NEXT:    global_store_dword v[2:3], v0, off
1036; GFX9-NEXT:    s_waitcnt vmcnt(0)
1037; GFX9-NEXT:    s_setpc_b64 s[30:31]
1038;
1039; GFX10-LABEL: v_store_global_v3bf16:
1040; GFX10:       ; %bb.0:
1041; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1042; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
1043; GFX10-NEXT:    global_store_dword v[2:3], v0, off
1044; GFX10-NEXT:    s_setpc_b64 s[30:31]
1045;
1046; GFX11-LABEL: v_store_global_v3bf16:
1047; GFX11:       ; %bb.0:
1048; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049; GFX11-NEXT:    s_clause 0x1
1050; GFX11-NEXT:    global_store_b16 v[2:3], v1, off offset:4
1051; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
1052; GFX11-NEXT:    s_setpc_b64 s[30:31]
1053  store <3 x bfloat> %val, ptr addrspace(1) %ptr
1054  ret void
1055}
1056
; Codegen test for storing a <4 x bfloat> argument (%val) through a
; ptr addrspace(1) argument (%ptr). The IR body is a single vector store.
; What the expected output below shows:
;   - the GCN and GFX7 check blocks combine the four incoming values into two
;     dwords (v_mul_f32 by 1.0, v_lshrrev_b32 by 16 on the odd-numbered
;     inputs, v_alignbit_b32 per pair) and emit one buffer_store_dwordx2;
;   - the GFX8, GFX9, GFX10 and GFX11 check blocks store v[0:1] directly with
;     a single 64-bit flat/global store and no packing instructions.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them rather than editing by hand.
1057define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
1058; GCN-LABEL: v_store_global_v4bf16:
1059; GCN:       ; %bb.0:
1060; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1061; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1062; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1063; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1064; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1065; GCN-NEXT:    s_mov_b32 s6, 0
1066; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1067; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
1068; GCN-NEXT:    v_alignbit_b32 v1, v3, v2, 16
1069; GCN-NEXT:    v_alignbit_b32 v0, v6, v0, 16
1070; GCN-NEXT:    s_mov_b32 s7, 0xf000
1071; GCN-NEXT:    s_mov_b32 s4, s6
1072; GCN-NEXT:    s_mov_b32 s5, s6
1073; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
1074; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1075; GCN-NEXT:    s_setpc_b64 s[30:31]
1076;
1077; GFX7-LABEL: v_store_global_v4bf16:
1078; GFX7:       ; %bb.0:
1079; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1081; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1082; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1083; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1084; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1085; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1086; GFX7-NEXT:    s_mov_b32 s6, 0
1087; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
1088; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
1089; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1090; GFX7-NEXT:    s_mov_b32 s4, s6
1091; GFX7-NEXT:    s_mov_b32 s5, s6
1092; GFX7-NEXT:    buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
1093; GFX7-NEXT:    s_waitcnt vmcnt(0)
1094; GFX7-NEXT:    s_setpc_b64 s[30:31]
1095;
1096; GFX8-LABEL: v_store_global_v4bf16:
1097; GFX8:       ; %bb.0:
1098; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1100; GFX8-NEXT:    s_waitcnt vmcnt(0)
1101; GFX8-NEXT:    s_setpc_b64 s[30:31]
1102;
1103; GFX9-LABEL: v_store_global_v4bf16:
1104; GFX9:       ; %bb.0:
1105; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1106; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1107; GFX9-NEXT:    s_waitcnt vmcnt(0)
1108; GFX9-NEXT:    s_setpc_b64 s[30:31]
1109;
1110; GFX10-LABEL: v_store_global_v4bf16:
1111; GFX10:       ; %bb.0:
1112; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1113; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1114; GFX10-NEXT:    s_setpc_b64 s[30:31]
1115;
1116; GFX11-LABEL: v_store_global_v4bf16:
1117; GFX11:       ; %bb.0:
1118; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1119; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1120; GFX11-NEXT:    s_setpc_b64 s[30:31]
1121  store <4 x bfloat> %val, ptr addrspace(1) %ptr
1122  ret void
1123}
1124
; Codegen test for storing an <8 x bfloat> argument (%val) through a
; ptr addrspace(1) argument (%ptr). The IR body is a single vector store.
; What the expected output below shows:
;   - the GCN and GFX7 check blocks combine the eight incoming values
;     (v0..v7) into four dwords (v_mul_f32 by 1.0, v_lshrrev_b32 by 16 on the
;     odd-numbered inputs, v_alignbit_b32 per pair) and emit one
;     buffer_store_dwordx4 addressed by v[8:9];
;   - the GFX8, GFX9, GFX10 and GFX11 check blocks store v[0:3] directly with
;     a single 128-bit flat/global store addressed by v[4:5].
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them rather than editing by hand.
1125define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
1126; GCN-LABEL: v_store_global_v8bf16:
1127; GCN:       ; %bb.0:
1128; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1129; GCN-NEXT:    s_mov_b32 s7, 0xf000
1130; GCN-NEXT:    s_mov_b32 s6, 0
1131; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1132; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1133; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1134; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1135; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1136; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v2
1137; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1138; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1139; GCN-NEXT:    s_mov_b32 s4, s6
1140; GCN-NEXT:    s_mov_b32 s5, s6
1141; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
1142; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1143; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
1144; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
1145; GCN-NEXT:    v_alignbit_b32 v3, v2, v6, 16
1146; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
1147; GCN-NEXT:    v_alignbit_b32 v1, v7, v10, 16
1148; GCN-NEXT:    v_alignbit_b32 v0, v11, v0, 16
1149; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1150; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1151; GCN-NEXT:    s_setpc_b64 s[30:31]
1152;
1153; GFX7-LABEL: v_store_global_v8bf16:
1154; GFX7:       ; %bb.0:
1155; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1156; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1157; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1158; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1159; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1160; GFX7-NEXT:    s_mov_b32 s6, 0
1161; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1162; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1163; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1164; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1165; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1166; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1167; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1168; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1169; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1170; GFX7-NEXT:    s_mov_b32 s4, s6
1171; GFX7-NEXT:    s_mov_b32 s5, s6
1172; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
1173; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
1174; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
1175; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
1176; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
1177; GFX7-NEXT:    s_waitcnt vmcnt(0)
1178; GFX7-NEXT:    s_setpc_b64 s[30:31]
1179;
1180; GFX8-LABEL: v_store_global_v8bf16:
1181; GFX8:       ; %bb.0:
1182; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1183; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1184; GFX8-NEXT:    s_waitcnt vmcnt(0)
1185; GFX8-NEXT:    s_setpc_b64 s[30:31]
1186;
1187; GFX9-LABEL: v_store_global_v8bf16:
1188; GFX9:       ; %bb.0:
1189; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1190; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1191; GFX9-NEXT:    s_waitcnt vmcnt(0)
1192; GFX9-NEXT:    s_setpc_b64 s[30:31]
1193;
1194; GFX10-LABEL: v_store_global_v8bf16:
1195; GFX10:       ; %bb.0:
1196; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1198; GFX10-NEXT:    s_setpc_b64 s[30:31]
1199;
1200; GFX11-LABEL: v_store_global_v8bf16:
1201; GFX11:       ; %bb.0:
1202; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1203; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
1204; GFX11-NEXT:    s_setpc_b64 s[30:31]
1205  store <8 x bfloat> %val, ptr addrspace(1) %ptr
1206  ret void
1207}
1208
; Codegen test for storing a <16 x bfloat> argument (%val) through a
; ptr addrspace(1) argument (%ptr). The IR body is a single vector store.
; What the expected output below shows (every prefix splits the store into
; two 128-bit stores, at offsets 0 and 16):
;   - the GCN and GFX7 check blocks combine the sixteen incoming values
;     (v0..v15) into eight dwords (v_mul_f32 by 1.0, v_lshrrev_b32 by 16 on
;     the odd-numbered inputs, v_alignbit_b32 per pair) and emit two
;     buffer_store_dwordx4 addressed by v[16:17];
;   - the GFX8 check block stores v[0:3] at v[8:9], then forms the second
;     address in v[0:1] (v_add_u32 / v_addc_u32 by 16) and stores v[4:7];
;     an s_nop 0 sits between the first store and the v_add_u32 that
;     overwrites v0;
;   - the GFX9, GFX10 and GFX11 check blocks use the offset field of the
;     global store instead; GFX11 also groups both stores under s_clause 0x1.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them rather than editing by hand.
1209define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
1210; GCN-LABEL: v_store_global_v16bf16:
1211; GCN:       ; %bb.0:
1212; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1213; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1214; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1215; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1216; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1217; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1218; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v2
1219; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1220; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1221; GCN-NEXT:    s_mov_b32 s7, 0xf000
1222; GCN-NEXT:    s_mov_b32 s6, 0
1223; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v15
1224; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
1225; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
1226; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
1227; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
1228; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
1229; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
1230; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
1231; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1232; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1233; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
1234; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
1235; GCN-NEXT:    s_mov_b32 s4, s6
1236; GCN-NEXT:    s_mov_b32 s5, s6
1237; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
1238; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
1239; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
1240; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
1241; GCN-NEXT:    v_alignbit_b32 v3, v7, v6, 16
1242; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
1243; GCN-NEXT:    v_alignbit_b32 v1, v15, v18, 16
1244; GCN-NEXT:    v_alignbit_b32 v0, v19, v0, 16
1245; GCN-NEXT:    v_alignbit_b32 v7, v20, v14, 16
1246; GCN-NEXT:    v_alignbit_b32 v6, v13, v12, 16
1247; GCN-NEXT:    v_alignbit_b32 v5, v11, v10, 16
1248; GCN-NEXT:    v_alignbit_b32 v4, v9, v8, 16
1249; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
1250; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
1251; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1252; GCN-NEXT:    s_setpc_b64 s[30:31]
1253;
1254; GFX7-LABEL: v_store_global_v16bf16:
1255; GFX7:       ; %bb.0:
1256; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1257; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1258; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1259; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1260; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1261; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1262; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1263; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1264; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1265; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1266; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
1267; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
1268; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
1269; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v15
1270; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1271; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v14
1272; GFX7-NEXT:    v_alignbit_b32 v14, v0, v1, 16
1273; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v13
1274; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1275; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v12
1276; GFX7-NEXT:    v_alignbit_b32 v13, v0, v1, 16
1277; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v11
1278; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1279; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v10
1280; GFX7-NEXT:    v_alignbit_b32 v12, v0, v1, 16
1281; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v9
1282; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1283; GFX7-NEXT:    s_mov_b32 s6, 0
1284; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1285; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v8
1286; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1287; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1288; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1289; GFX7-NEXT:    s_mov_b32 s4, s6
1290; GFX7-NEXT:    s_mov_b32 s5, s6
1291; GFX7-NEXT:    v_alignbit_b32 v11, v0, v1, 16
1292; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
1293; GFX7-NEXT:    buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
1294; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
1295; GFX7-NEXT:    s_waitcnt vmcnt(0)
1296; GFX7-NEXT:    s_setpc_b64 s[30:31]
1297;
1298; GFX8-LABEL: v_store_global_v16bf16:
1299; GFX8:       ; %bb.0:
1300; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1302; GFX8-NEXT:    s_nop 0
1303; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
1304; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
1305; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
1306; GFX8-NEXT:    s_waitcnt vmcnt(0)
1307; GFX8-NEXT:    s_setpc_b64 s[30:31]
1308;
1309; GFX9-LABEL: v_store_global_v16bf16:
1310; GFX9:       ; %bb.0:
1311; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1312; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
1313; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
1314; GFX9-NEXT:    s_waitcnt vmcnt(0)
1315; GFX9-NEXT:    s_setpc_b64 s[30:31]
1316;
1317; GFX10-LABEL: v_store_global_v16bf16:
1318; GFX10:       ; %bb.0:
1319; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1320; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
1321; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
1322; GFX10-NEXT:    s_setpc_b64 s[30:31]
1323;
1324; GFX11-LABEL: v_store_global_v16bf16:
1325; GFX11:       ; %bb.0:
1326; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1327; GFX11-NEXT:    s_clause 0x1
1328; GFX11-NEXT:    global_store_b128 v[8:9], v[4:7], off offset:16
1329; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
1330; GFX11-NEXT:    s_setpc_b64 s[30:31]
1331  store <16 x bfloat> %val, ptr addrspace(1) %ptr
1332  ret void
1333}
1334
1335define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
1336; GCN-LABEL: v_store_global_v32bf16:
1337; GCN:       ; %bb.0:
1338; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1339; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
1340; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
1341; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
1342; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
1343; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
1344; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
1345; GCN-NEXT:    v_alignbit_b32 v21, v23, v22, 16
1346; GCN-NEXT:    v_alignbit_b32 v20, v31, v20, 16
1347; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
1348; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
1349; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
1350; GCN-NEXT:    v_alignbit_b32 v19, v19, v18, 16
1351; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
1352; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
1353; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
1354; GCN-NEXT:    v_alignbit_b32 v18, v17, v16, 16
1355; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1356; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1357; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1358; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1359; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1360; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
1361; GCN-NEXT:    v_alignbit_b32 v5, v7, v6, 16
1362; GCN-NEXT:    v_alignbit_b32 v4, v16, v4, 16
1363; GCN-NEXT:    s_mov_b32 s6, 0
1364; GCN-NEXT:    s_mov_b32 s7, 0xf000
1365; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1366; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1367; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v1
1368; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v0
1369; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
1370; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
1371; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
1372; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
1373; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
1374; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
1375; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
1376; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v8
1377; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v29
1378; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v28
1379; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v27
1380; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v26
1381; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
1382; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
1383; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
1384; GCN-NEXT:    v_alignbit_b32 v3, v0, v2, 16
1385; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1386; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
1387; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32
1388; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v30
1389; GCN-NEXT:    s_mov_b32 s4, s6
1390; GCN-NEXT:    s_mov_b32 s5, s6
1391; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
1392; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v15
1393; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
1394; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
1395; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
1396; GCN-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
1397; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
1398; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
1399; GCN-NEXT:    v_alignbit_b32 v2, v2, v7, 16
1400; GCN-NEXT:    v_alignbit_b32 v9, v6, v14, 16
1401; GCN-NEXT:    v_alignbit_b32 v8, v13, v12, 16
1402; GCN-NEXT:    v_alignbit_b32 v7, v11, v10, 16
1403; GCN-NEXT:    v_alignbit_b32 v6, v15, v16, 16
1404; GCN-NEXT:    v_alignbit_b32 v12, v28, v17, 16
1405; GCN-NEXT:    v_alignbit_b32 v11, v22, v23, 16
1406; GCN-NEXT:    v_alignbit_b32 v10, v25, v24, 16
1407; GCN-NEXT:    s_waitcnt vmcnt(1)
1408; GCN-NEXT:    buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
1409; GCN-NEXT:    s_waitcnt vmcnt(1)
1410; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v26
1411; GCN-NEXT:    buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
1412; GCN-NEXT:    s_waitcnt expcnt(0)
1413; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
1414; GCN-NEXT:    v_alignbit_b32 v13, v6, v27, 16
1415; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
1416; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
1417; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1418; GCN-NEXT:    s_setpc_b64 s[30:31]
1419;
1420; GFX7-LABEL: v_store_global_v32bf16:
1421; GFX7:       ; %bb.0:
1422; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1423; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1424; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1425; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1426; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1427; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1428; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1429; GFX7-NEXT:    v_alignbit_b32 v3, v3, v2, 16
1430; GFX7-NEXT:    v_alignbit_b32 v2, v1, v0, 16
1431; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v14
1432; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32
1433; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
1434; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1435; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v15
1436; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
1437; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
1438; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1439; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1440; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1441; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1442; GFX7-NEXT:    v_alignbit_b32 v25, v25, v24, 16
1443; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v5
1444; GFX7-NEXT:    v_alignbit_b32 v5, v7, v6, 16
1445; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v13
1446; GFX7-NEXT:    v_alignbit_b32 v13, v0, v1, 16
1447; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1448; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
1449; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v12
1450; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1451; GFX7-NEXT:    v_alignbit_b32 v12, v6, v7, 16
1452; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v11
1453; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
1454; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1455; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
1456; GFX7-NEXT:    v_alignbit_b32 v11, v7, v10, 16
1457; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
1458; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
1459; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
1460; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v30
1461; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
1462; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
1463; GFX7-NEXT:    v_alignbit_b32 v27, v29, v28, 16
1464; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
1465; GFX7-NEXT:    s_mov_b32 s6, 0
1466; GFX7-NEXT:    v_alignbit_b32 v26, v31, v26, 16
1467; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1468; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1469; GFX7-NEXT:    s_mov_b32 s4, s6
1470; GFX7-NEXT:    s_mov_b32 s5, s6
1471; GFX7-NEXT:    v_alignbit_b32 v4, v24, v4, 16
1472; GFX7-NEXT:    s_waitcnt vmcnt(2)
1473; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v14
1474; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1475; GFX7-NEXT:    v_alignbit_b32 v28, v7, v6, 16
1476; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
1477; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v8
1478; GFX7-NEXT:    v_alignbit_b32 v10, v6, v7, 16
1479; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v23
1480; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1481; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v22
1482; GFX7-NEXT:    v_alignbit_b32 v9, v6, v7, 16
1483; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v19
1484; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v21
1485; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1486; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v18
1487; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
1488; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v20
1489; GFX7-NEXT:    v_alignbit_b32 v7, v6, v7, 16
1490; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v17
1491; GFX7-NEXT:    v_alignbit_b32 v8, v8, v14, 16
1492; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1493; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v16
1494; GFX7-NEXT:    v_alignbit_b32 v6, v6, v14, 16
1495; GFX7-NEXT:    s_waitcnt vmcnt(0)
1496; GFX7-NEXT:    buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
1497; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
1498; GFX7-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
1499; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
1500; GFX7-NEXT:    s_waitcnt vmcnt(0)
1501; GFX7-NEXT:    s_setpc_b64 s[30:31]
1502;
1503; GFX8-LABEL: v_store_global_v32bf16:
1504; GFX8:       ; %bb.0:
1505; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1506; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
1507; GFX8-NEXT:    s_nop 0
1508; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v16
1509; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
1510; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
1511; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v16
1512; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
1513; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
1514; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v16
1515; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
1516; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
1517; GFX8-NEXT:    s_waitcnt vmcnt(0)
1518; GFX8-NEXT:    s_setpc_b64 s[30:31]
1519;
1520; GFX9-LABEL: v_store_global_v32bf16:
1521; GFX9:       ; %bb.0:
1522; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1523; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
1524; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
1525; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
1526; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
1527; GFX9-NEXT:    s_waitcnt vmcnt(0)
1528; GFX9-NEXT:    s_setpc_b64 s[30:31]
1529;
1530; GFX10-LABEL: v_store_global_v32bf16:
1531; GFX10:       ; %bb.0:
1532; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1533; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
1534; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
1535; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
1536; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
1537; GFX10-NEXT:    s_setpc_b64 s[30:31]
1538;
1539; GFX11-LABEL: v_store_global_v32bf16:
1540; GFX11:       ; %bb.0:
1541; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1542; GFX11-NEXT:    s_clause 0x3
1543; GFX11-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:48
1544; GFX11-NEXT:    global_store_b128 v[16:17], v[8:11], off offset:32
1545; GFX11-NEXT:    global_store_b128 v[16:17], v[4:7], off offset:16
1546; GFX11-NEXT:    global_store_b128 v[16:17], v[0:3], off
1547; GFX11-NEXT:    s_setpc_b64 s[30:31]
1548  store <32 x bfloat> %val, ptr addrspace(1) %ptr
1549  ret void
1550}
1551
; Stores a <64 x bfloat> argument to a global (addrspace(1)) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected output per check prefix: the GCN and GFX7 sequences run
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32 on the inputs before issuing
; eight buffer_store_dwordx4, while the GFX8 through GFX11 sequences only
; load three dwords relative to s32 (offsets 0, 4 and 8) and then issue eight
; 128-bit stores covering byte offsets 0 through 112 from the pointer.
1552define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
1553; GCN-LABEL: v_store_global_v64bf16:
1554; GCN:       ; %bb.0:
1555; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1556; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
1557; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
1558; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
1559; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
1560; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
1561; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
1562; GCN-NEXT:    v_alignbit_b32 v21, v23, v22, 16
1563; GCN-NEXT:    v_alignbit_b32 v20, v31, v20, 16
1564; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
1565; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
1566; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
1567; GCN-NEXT:    v_alignbit_b32 v19, v19, v18, 16
1568; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
1569; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
1570; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
1571; GCN-NEXT:    v_alignbit_b32 v18, v17, v16, 16
1572; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
1573; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
1574; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
1575; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
1576; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
1577; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v13
1578; GCN-NEXT:    v_alignbit_b32 v13, v15, v14, 16
1579; GCN-NEXT:    v_alignbit_b32 v12, v16, v12, 16
1580; GCN-NEXT:    s_mov_b32 s6, 0
1581; GCN-NEXT:    s_mov_b32 s7, 0xf000
1582; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
1583; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
1584; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
1585; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
1586; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1587; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1588; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1589; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1590; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1591; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v2
1592; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1593; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1594; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v29
1595; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v28
1596; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v27
1597; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v26
1598; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
1599; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
1600; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1601; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1602; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
1603; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
1604; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
1605; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
1606; GCN-NEXT:    v_alignbit_b32 v11, v11, v10, 16
1607; GCN-NEXT:    v_alignbit_b32 v10, v9, v8, 16
1608; GCN-NEXT:    v_alignbit_b32 v3, v7, v6, 16
1609; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
1610; GCN-NEXT:    v_alignbit_b32 v1, v22, v14, 16
1611; GCN-NEXT:    v_alignbit_b32 v0, v23, v0, 16
1612; GCN-NEXT:    v_alignbit_b32 v6, v26, v15, 16
1613; GCN-NEXT:    v_alignbit_b32 v5, v16, v17, 16
1614; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:136
1615; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:132
1616; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:128
1617; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
1618; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:120
1619; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
1620; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:112
1621; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:108
1622; GCN-NEXT:    s_mov_b32 s4, s6
1623; GCN-NEXT:    s_mov_b32 s5, s6
1624; GCN-NEXT:    s_waitcnt vmcnt(6)
1625; GCN-NEXT:    buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32
1626; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16
1627; GCN-NEXT:    s_waitcnt expcnt(0)
1628; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:104
1629; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:100
1630; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:96
1631; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:92
1632; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:88
1633; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:84
1634; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:80
1635; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:76
1636; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v25
1637; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
1638; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v30
1639; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
1640; GCN-NEXT:    v_alignbit_b32 v4, v4, v23, 16
1641; GCN-NEXT:    s_waitcnt vmcnt(14)
1642; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1643; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
1644; GCN-NEXT:    s_waitcnt vmcnt(13)
1645; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
1646; GCN-NEXT:    s_waitcnt vmcnt(12)
1647; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
1648; GCN-NEXT:    s_waitcnt vmcnt(11)
1649; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
1650; GCN-NEXT:    s_waitcnt vmcnt(10)
1651; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
1652; GCN-NEXT:    s_waitcnt vmcnt(7)
1653; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
1654; GCN-NEXT:    s_waitcnt vmcnt(6)
1655; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v11
1656; GCN-NEXT:    s_waitcnt vmcnt(5)
1657; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v12
1658; GCN-NEXT:    s_waitcnt vmcnt(4)
1659; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v13
1660; GCN-NEXT:    s_waitcnt vmcnt(3)
1661; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v18
1662; GCN-NEXT:    s_waitcnt vmcnt(2)
1663; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v19
1664; GCN-NEXT:    s_waitcnt vmcnt(1)
1665; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v20
1666; GCN-NEXT:    s_waitcnt vmcnt(0)
1667; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v21
1668; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1669; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
1670; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
1671; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
1672; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
1673; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v12
1674; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v13
1675; GCN-NEXT:    v_alignbit_b32 v13, v7, v14, 16
1676; GCN-NEXT:    v_alignbit_b32 v12, v15, v16, 16
1677; GCN-NEXT:    v_alignbit_b32 v11, v17, v22, 16
1678; GCN-NEXT:    v_alignbit_b32 v10, v10, v23, 16
1679; GCN-NEXT:    v_alignbit_b32 v17, v20, v25, 16
1680; GCN-NEXT:    v_alignbit_b32 v16, v21, v18, 16
1681; GCN-NEXT:    v_alignbit_b32 v15, v26, v19, 16
1682; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72
1683; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:68
1684; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32
1685; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:32
1686; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
1687; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
1688; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:20
1689; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:16
1690; GCN-NEXT:    s_waitcnt vmcnt(7)
1691; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1692; GCN-NEXT:    s_waitcnt vmcnt(6)
1693; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
1694; GCN-NEXT:    s_waitcnt vmcnt(5)
1695; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
1696; GCN-NEXT:    s_waitcnt vmcnt(4)
1697; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
1698; GCN-NEXT:    s_waitcnt vmcnt(3)
1699; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
1700; GCN-NEXT:    s_waitcnt vmcnt(2)
1701; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
1702; GCN-NEXT:    s_waitcnt vmcnt(1)
1703; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
1704; GCN-NEXT:    s_waitcnt vmcnt(0)
1705; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
1706; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1707; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
1708; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
1709; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v21
1710; GCN-NEXT:    v_alignbit_b32 v14, v7, v14, 16
1711; GCN-NEXT:    v_alignbit_b32 v7, v18, v24, 16
1712; GCN-NEXT:    v_alignbit_b32 v21, v19, v20, 16
1713; GCN-NEXT:    v_alignbit_b32 v20, v25, v22, 16
1714; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
1715; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:8
1716; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:4
1717; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:64
1718; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60
1719; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
1720; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
1721; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:48
1722; GCN-NEXT:    s_waitcnt vmcnt(7)
1723; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
1724; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
1725; GCN-NEXT:    v_alignbit_b32 v19, v19, v18, 16
1726; GCN-NEXT:    s_waitcnt vmcnt(6)
1727; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v22
1728; GCN-NEXT:    s_waitcnt vmcnt(5)
1729; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v24
1730; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
1731; GCN-NEXT:    v_alignbit_b32 v18, v18, v22, 16
1732; GCN-NEXT:    s_waitcnt vmcnt(4)
1733; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v25
1734; GCN-NEXT:    s_waitcnt vmcnt(3)
1735; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v26
1736; GCN-NEXT:    s_waitcnt vmcnt(2)
1737; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v27
1738; GCN-NEXT:    s_waitcnt vmcnt(1)
1739; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v28
1740; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
1741; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
1742; GCN-NEXT:    v_alignbit_b32 v25, v22, v23, 16
1743; GCN-NEXT:    v_alignbit_b32 v24, v24, v26, 16
1744; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:44
1745; GCN-NEXT:    s_waitcnt vmcnt(1)
1746; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v29
1747; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:40
1748; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:36
1749; GCN-NEXT:    s_waitcnt vmcnt(2)
1750; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
1751; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
1752; GCN-NEXT:    v_alignbit_b32 v23, v23, v22, 16
1753; GCN-NEXT:    s_waitcnt vmcnt(1)
1754; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v26
1755; GCN-NEXT:    s_waitcnt vmcnt(0)
1756; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
1757; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
1758; GCN-NEXT:    v_alignbit_b32 v22, v22, v26, 16
1759; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
1760; GCN-NEXT:    buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
1761; GCN-NEXT:    buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
1762; GCN-NEXT:    buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
1763; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
1764; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1765; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1766; GCN-NEXT:    s_setpc_b64 s[30:31]
1767;
1768; GFX7-LABEL: v_store_global_v64bf16:
1769; GFX7:       ; %bb.0:
1770; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1771; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
1772; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
1773; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
1774; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:116
1775; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:112
1776; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:108
1777; GFX7-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:104
1778; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:100
1779; GFX7-NEXT:    s_mov_b32 s6, 0
1780; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1781; GFX7-NEXT:    s_mov_b32 s4, s6
1782; GFX7-NEXT:    s_mov_b32 s5, s6
1783; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
1784; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
1785; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1786; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1787; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1788; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
1789; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
1790; GFX7-NEXT:    v_alignbit_b32 v3, v3, v2, 16
1791; GFX7-NEXT:    v_alignbit_b32 v2, v1, v0, 16
1792; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v15
1793; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1794; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
1795; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
1796; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1797; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v14
1798; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v29
1799; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
1800; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v28
1801; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
1802; GFX7-NEXT:    s_waitcnt vmcnt(7)
1803; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
1804; GFX7-NEXT:    s_waitcnt vmcnt(6)
1805; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
1806; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
1807; GFX7-NEXT:    s_waitcnt vmcnt(5)
1808; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
1809; GFX7-NEXT:    v_alignbit_b32 v36, v31, v32, 16
1810; GFX7-NEXT:    s_waitcnt vmcnt(3)
1811; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v37
1812; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v34
1813; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1814; GFX7-NEXT:    s_waitcnt vmcnt(2)
1815; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v38
1816; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
1817; GFX7-NEXT:    v_alignbit_b32 v35, v33, v34, 16
1818; GFX7-NEXT:    v_alignbit_b32 v34, v31, v32, 16
1819; GFX7-NEXT:    s_waitcnt vmcnt(1)
1820; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v39
1821; GFX7-NEXT:    s_waitcnt vmcnt(0)
1822; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v48
1823; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
1824; GFX7-NEXT:    v_alignbit_b32 v33, v31, v32, 16
1825; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:136
1826; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
1827; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:96
1828; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:92
1829; GFX7-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:88
1830; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
1831; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:80
1832; GFX7-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:76
1833; GFX7-NEXT:    s_waitcnt vmcnt(6)
1834; GFX7-NEXT:    buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
1835; GFX7-NEXT:    s_waitcnt vmcnt(6)
1836; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v37
1837; GFX7-NEXT:    s_waitcnt vmcnt(5)
1838; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v38
1839; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1840; GFX7-NEXT:    s_waitcnt vmcnt(4)
1841; GFX7-NEXT:    v_mul_f32_e32 v35, 1.0, v39
1842; GFX7-NEXT:    v_alignbit_b32 v36, v33, v34, 16
1843; GFX7-NEXT:    s_waitcnt vmcnt(2)
1844; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v49
1845; GFX7-NEXT:    v_mul_f32_e32 v37, 1.0, v48
1846; GFX7-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
1847; GFX7-NEXT:    s_waitcnt vmcnt(1)
1848; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v50
1849; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1850; GFX7-NEXT:    v_alignbit_b32 v35, v35, v37, 16
1851; GFX7-NEXT:    v_alignbit_b32 v34, v33, v34, 16
1852; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
1853; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
1854; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:64
1855; GFX7-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:60
1856; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:56
1857; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:52
1858; GFX7-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:48
1859; GFX7-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
1860; GFX7-NEXT:    s_waitcnt vmcnt(7)
1861; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
1862; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1863; GFX7-NEXT:    s_waitcnt vmcnt(6)
1864; GFX7-NEXT:    v_mul_f32_e32 v37, 1.0, v37
1865; GFX7-NEXT:    v_alignbit_b32 v33, v33, v37, 16
1866; GFX7-NEXT:    buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
1867; GFX7-NEXT:    s_waitcnt vmcnt(3)
1868; GFX7-NEXT:    v_mul_f32_e32 v37, 1.0, v49
1869; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v38
1870; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1871; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v39
1872; GFX7-NEXT:    v_mul_f32_e32 v35, 1.0, v48
1873; GFX7-NEXT:    v_alignbit_b32 v36, v33, v34, 16
1874; GFX7-NEXT:    s_waitcnt vmcnt(2)
1875; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v50
1876; GFX7-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
1877; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1878; GFX7-NEXT:    s_waitcnt vmcnt(1)
1879; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v51
1880; GFX7-NEXT:    v_alignbit_b32 v35, v35, v37, 16
1881; GFX7-NEXT:    v_alignbit_b32 v34, v33, v34, 16
1882; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
1883; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
1884; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
1885; GFX7-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28
1886; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:24
1887; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:20
1888; GFX7-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:16
1889; GFX7-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
1890; GFX7-NEXT:    s_waitcnt vmcnt(7)
1891; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
1892; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1893; GFX7-NEXT:    s_waitcnt vmcnt(6)
1894; GFX7-NEXT:    v_mul_f32_e32 v37, 1.0, v37
1895; GFX7-NEXT:    v_alignbit_b32 v33, v33, v37, 16
1896; GFX7-NEXT:    buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
1897; GFX7-NEXT:    s_waitcnt vmcnt(3)
1898; GFX7-NEXT:    v_mul_f32_e32 v37, 1.0, v49
1899; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v38
1900; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1901; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v39
1902; GFX7-NEXT:    v_mul_f32_e32 v35, 1.0, v48
1903; GFX7-NEXT:    v_alignbit_b32 v36, v33, v34, 16
1904; GFX7-NEXT:    s_waitcnt vmcnt(2)
1905; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v50
1906; GFX7-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
1907; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1908; GFX7-NEXT:    s_waitcnt vmcnt(1)
1909; GFX7-NEXT:    v_mul_f32_e32 v34, 1.0, v51
1910; GFX7-NEXT:    v_alignbit_b32 v35, v35, v37, 16
1911; GFX7-NEXT:    v_alignbit_b32 v34, v33, v34, 16
1912; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
1913; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:4
1914; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32
1915; GFX7-NEXT:    s_waitcnt vmcnt(2)
1916; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
1917; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
1918; GFX7-NEXT:    s_waitcnt vmcnt(1)
1919; GFX7-NEXT:    v_mul_f32_e32 v37, 1.0, v37
1920; GFX7-NEXT:    v_alignbit_b32 v33, v33, v37, 16
1921; GFX7-NEXT:    buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
1922; GFX7-NEXT:    s_nop 0
1923; GFX7-NEXT:    v_lshrrev_b32_e32 v33, 16, v5
1924; GFX7-NEXT:    v_alignbit_b32 v5, v7, v6, 16
1925; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v13
1926; GFX7-NEXT:    v_alignbit_b32 v13, v0, v1, 16
1927; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v11
1928; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1929; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v10
1930; GFX7-NEXT:    v_alignbit_b32 v11, v0, v1, 16
1931; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v9
1932; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1933; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v8
1934; GFX7-NEXT:    v_alignbit_b32 v10, v0, v1, 16
1935; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v23
1936; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1937; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v12
1938; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1939; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v22
1940; GFX7-NEXT:    v_alignbit_b32 v12, v6, v7, 16
1941; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v21
1942; GFX7-NEXT:    v_alignbit_b32 v9, v0, v1, 16
1943; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v19
1944; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1945; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v20
1946; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1947; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v18
1948; GFX7-NEXT:    v_alignbit_b32 v8, v6, v7, 16
1949; GFX7-NEXT:    v_alignbit_b32 v7, v0, v1, 16
1950; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v17
1951; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1952; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v16
1953; GFX7-NEXT:    v_alignbit_b32 v6, v0, v1, 16
1954; GFX7-NEXT:    s_waitcnt vmcnt(1)
1955; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v38
1956; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1957; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v30
1958; GFX7-NEXT:    v_alignbit_b32 v17, v0, v1, 16
1959; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v27
1960; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1961; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v26
1962; GFX7-NEXT:    v_alignbit_b32 v16, v14, v15, 16
1963; GFX7-NEXT:    v_alignbit_b32 v15, v0, v1, 16
1964; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v25
1965; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1966; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v24
1967; GFX7-NEXT:    v_alignbit_b32 v14, v0, v1, 16
1968; GFX7-NEXT:    v_alignbit_b32 v4, v33, v4, 16
1969; GFX7-NEXT:    buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
1970; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
1971; GFX7-NEXT:    buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
1972; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64
1973; GFX7-NEXT:    s_waitcnt vmcnt(0)
1974; GFX7-NEXT:    s_setpc_b64 s[30:31]
1975;
1976; GFX8-LABEL: v_store_global_v64bf16:
1977; GFX8:       ; %bb.0:
1978; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1979; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
1980; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
1981; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
1982; GFX8-NEXT:    s_movk_i32 s4, 0x70
1983; GFX8-NEXT:    s_movk_i32 s5, 0x50
1984; GFX8-NEXT:    s_waitcnt vmcnt(2)
1985; GFX8-NEXT:    v_add_u32_e32 v34, vcc, s4, v32
1986; GFX8-NEXT:    s_waitcnt vmcnt(1)
1987; GFX8-NEXT:    v_addc_u32_e32 v35, vcc, 0, v33, vcc
1988; GFX8-NEXT:    s_movk_i32 s4, 0x60
1989; GFX8-NEXT:    s_waitcnt vmcnt(0)
1990; GFX8-NEXT:    flat_store_dwordx4 v[34:35], v[28:31]
1991; GFX8-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
1992; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v32
1993; GFX8-NEXT:    v_addc_u32_e32 v29, vcc, 0, v33, vcc
1994; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s5, v32
1995; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
1996; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v32
1997; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v33, vcc
1998; GFX8-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
1999; GFX8-NEXT:    s_nop 0
2000; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 48, v32
2001; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v33, vcc
2002; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 32, v32
2003; GFX8-NEXT:    v_addc_u32_e32 v27, vcc, 0, v33, vcc
2004; GFX8-NEXT:    v_add_u32_e32 v28, vcc, 16, v32
2005; GFX8-NEXT:    v_addc_u32_e32 v29, vcc, 0, v33, vcc
2006; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
2007; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
2008; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[12:15]
2009; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
2010; GFX8-NEXT:    flat_store_dwordx4 v[28:29], v[4:7]
2011; GFX8-NEXT:    s_waitcnt vmcnt(0)
2012; GFX8-NEXT:    s_setpc_b64 s[30:31]
2013;
2014; GFX9-LABEL: v_store_global_v64bf16:
2015; GFX9:       ; %bb.0:
2016; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2017; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
2018; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
2019; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
2020; GFX9-NEXT:    s_waitcnt vmcnt(0)
2021; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
2022; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
2023; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
2024; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
2025; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
2026; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
2027; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
2028; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
2029; GFX9-NEXT:    s_waitcnt vmcnt(0)
2030; GFX9-NEXT:    s_setpc_b64 s[30:31]
2031;
2032; GFX10-LABEL: v_store_global_v64bf16:
2033; GFX10:       ; %bb.0:
2034; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2035; GFX10-NEXT:    s_clause 0x2
2036; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
2037; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
2038; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
2039; GFX10-NEXT:    s_waitcnt vmcnt(0)
2040; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
2041; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
2042; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
2043; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
2044; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
2045; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
2046; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
2047; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
2048; GFX10-NEXT:    s_setpc_b64 s[30:31]
2049;
2050; GFX11-LABEL: v_store_global_v64bf16:
2051; GFX11:       ; %bb.0:
2052; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2053; GFX11-NEXT:    s_clause 0x2
2054; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
2055; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
2056; GFX11-NEXT:    scratch_load_b32 v31, off, s32
2057; GFX11-NEXT:    s_waitcnt vmcnt(0)
2058; GFX11-NEXT:    s_clause 0x7
2059; GFX11-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:112
2060; GFX11-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:96
2061; GFX11-NEXT:    global_store_b128 v[32:33], v[20:23], off offset:80
2062; GFX11-NEXT:    global_store_b128 v[32:33], v[16:19], off offset:64
2063; GFX11-NEXT:    global_store_b128 v[32:33], v[12:15], off offset:48
2064; GFX11-NEXT:    global_store_b128 v[32:33], v[8:11], off offset:32
2065; GFX11-NEXT:    global_store_b128 v[32:33], v[4:7], off offset:16
2066; GFX11-NEXT:    global_store_b128 v[32:33], v[0:3], off
2067; GFX11-NEXT:    s_setpc_b64 s[30:31]
2068  store <64 x bfloat> %val, ptr addrspace(1) %ptr
2069  ret void
2070}
2071
; Stores the bfloat constants 1.0 and 42.0 to the two global (addrspace(1))
; pointers %ptr0 and %ptr1.
; Every run line expects the constants to be materialized in VGPRs as the
; 16-bit patterns 0x3f80 and 0x4228 and written with a 16-bit store.
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2072define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
2073; GCN-LABEL: test_store_fpimm:
2074; GCN:       ; %bb.0:
2075; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2076; GCN-NEXT:    s_mov_b32 s7, 0xf000
2077; GCN-NEXT:    s_mov_b32 s6, 0
2078; GCN-NEXT:    v_mov_b32_e32 v4, 0x3f80
2079; GCN-NEXT:    v_mov_b32_e32 v5, 0x4228
2080; GCN-NEXT:    s_mov_b32 s4, s6
2081; GCN-NEXT:    s_mov_b32 s5, s6
2082; GCN-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
2083; GCN-NEXT:    buffer_store_short v5, v[2:3], s[4:7], 0 addr64
2084; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2085; GCN-NEXT:    s_setpc_b64 s[30:31]
2086;
2087; GFX7-LABEL: test_store_fpimm:
2088; GFX7:       ; %bb.0:
2089; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2090; GFX7-NEXT:    s_mov_b32 s6, 0
2091; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2092; GFX7-NEXT:    s_mov_b32 s4, s6
2093; GFX7-NEXT:    s_mov_b32 s5, s6
2094; GFX7-NEXT:    v_mov_b32_e32 v4, 0x3f80
2095; GFX7-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
2096; GFX7-NEXT:    v_mov_b32_e32 v0, 0x4228
2097; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2098; GFX7-NEXT:    s_waitcnt vmcnt(0)
2099; GFX7-NEXT:    s_setpc_b64 s[30:31]
2100;
2101; GFX8-LABEL: test_store_fpimm:
2102; GFX8:       ; %bb.0:
2103; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2104; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3f80
2105; GFX8-NEXT:    flat_store_short v[0:1], v4
2106; GFX8-NEXT:    v_mov_b32_e32 v0, 0x4228
2107; GFX8-NEXT:    flat_store_short v[2:3], v0
2108; GFX8-NEXT:    s_waitcnt vmcnt(0)
2109; GFX8-NEXT:    s_setpc_b64 s[30:31]
2110;
2111; GFX9-LABEL: test_store_fpimm:
2112; GFX9:       ; %bb.0:
2113; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2114; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f80
2115; GFX9-NEXT:    global_store_short v[0:1], v4, off
2116; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4228
2117; GFX9-NEXT:    global_store_short v[2:3], v0, off
2118; GFX9-NEXT:    s_waitcnt vmcnt(0)
2119; GFX9-NEXT:    s_setpc_b64 s[30:31]
2120;
2121; GFX10-LABEL: test_store_fpimm:
2122; GFX10:       ; %bb.0:
2123; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2124; GFX10-NEXT:    v_mov_b32_e32 v4, 0x3f80
2125; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4228
2126; GFX10-NEXT:    global_store_short v[0:1], v4, off
2127; GFX10-NEXT:    global_store_short v[2:3], v5, off
2128; GFX10-NEXT:    s_setpc_b64 s[30:31]
2129;
2130; GFX11-LABEL: test_store_fpimm:
2131; GFX11:       ; %bb.0:
2132; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2133; GFX11-NEXT:    v_mov_b32_e32 v4, 0x3f80
2134; GFX11-NEXT:    v_mov_b32_e32 v5, 0x4228
2135; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
2136; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
2137; GFX11-NEXT:    s_setpc_b64 s[30:31]
2138  store bfloat 1.0, ptr addrspace(1) %ptr0
2139  store bfloat 42.0, ptr addrspace(1) %ptr1
2140  ret void
2141}
2142
; Loads a float from %in, narrows it to bfloat with fptrunc, and stores the
; 16-bit result to %out (both pointers are addrspace(1)).
; On the GCN and GFX7 runs the expected code multiplies the value by 1.0 and
; keeps its upper 16 bits via a right shift by 16.
; On the GFX8 and later runs the expected code adds bit 16 of the value plus
; 0x7fff to the value, and instead selects the value OR'ed with 0x400000 when
; v_cmp_u_f32 of the value against itself sets the condition.
; NOTE(review): this looks like round-to-nearest-even with NaN quieting;
; confirm against the AMDGPU fptrunc lowering.
; The GFX9 and later runs store the upper half directly with a d16_hi store,
; so no explicit shift appears there.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2143define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2144; GCN-LABEL: test_load_store_f32_to_bf16:
2145; GCN:       ; %bb.0:
2146; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2147; GCN-NEXT:    s_mov_b32 s6, 0
2148; GCN-NEXT:    s_mov_b32 s7, 0xf000
2149; GCN-NEXT:    s_mov_b32 s4, s6
2150; GCN-NEXT:    s_mov_b32 s5, s6
2151; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2152; GCN-NEXT:    s_waitcnt vmcnt(0)
2153; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2154; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2155; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2156; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2157; GCN-NEXT:    s_setpc_b64 s[30:31]
2158;
2159; GFX7-LABEL: test_load_store_f32_to_bf16:
2160; GFX7:       ; %bb.0:
2161; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2162; GFX7-NEXT:    s_mov_b32 s6, 0
2163; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2164; GFX7-NEXT:    s_mov_b32 s4, s6
2165; GFX7-NEXT:    s_mov_b32 s5, s6
2166; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2167; GFX7-NEXT:    s_waitcnt vmcnt(0)
2168; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2169; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2170; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2171; GFX7-NEXT:    s_waitcnt vmcnt(0)
2172; GFX7-NEXT:    s_setpc_b64 s[30:31]
2173;
2174; GFX8-LABEL: test_load_store_f32_to_bf16:
2175; GFX8:       ; %bb.0:
2176; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2177; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2178; GFX8-NEXT:    s_waitcnt vmcnt(0)
2179; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
2180; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
2181; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
2182; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
2183; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
2184; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
2185; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2186; GFX8-NEXT:    flat_store_short v[2:3], v0
2187; GFX8-NEXT:    s_waitcnt vmcnt(0)
2188; GFX8-NEXT:    s_setpc_b64 s[30:31]
2189;
2190; GFX9-LABEL: test_load_store_f32_to_bf16:
2191; GFX9:       ; %bb.0:
2192; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2193; GFX9-NEXT:    global_load_dword v0, v[0:1], off
2194; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
2195; GFX9-NEXT:    s_waitcnt vmcnt(0)
2196; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
2197; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
2198; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
2199; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
2200; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
2201; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
2202; GFX9-NEXT:    s_waitcnt vmcnt(0)
2203; GFX9-NEXT:    s_setpc_b64 s[30:31]
2204;
2205; GFX10-LABEL: test_load_store_f32_to_bf16:
2206; GFX10:       ; %bb.0:
2207; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2208; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2209; GFX10-NEXT:    s_waitcnt vmcnt(0)
2210; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
2211; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
2212; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
2213; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
2214; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc_lo
2215; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
2216; GFX10-NEXT:    s_setpc_b64 s[30:31]
2217;
2218; GFX11-LABEL: test_load_store_f32_to_bf16:
2219; GFX11:       ; %bb.0:
2220; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2221; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2222; GFX11-NEXT:    s_waitcnt vmcnt(0)
2223; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
2224; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
2225; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
2226; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2227; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
2228; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc_lo
2229; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
2230; GFX11-NEXT:    s_setpc_b64 s[30:31]
2231  %val = load float, ptr addrspace(1) %in
2232  %val.bf16 = fptrunc float %val to bfloat
2233  store bfloat %val.bf16, ptr addrspace(1) %out
2234  ret void
2235}
2236
; Loads a double from %in, narrows it to bfloat with fptrunc, and stores the
; 16-bit result to %out (both pointers are addrspace(1)).
; On the GCN and GFX7 runs the expected code is a single v_cvt_f32_f64
; followed by a right shift by 16.
; On the GFX8 and later runs the expected code converts the absolute value to
; f32, converts that back to f64 and compares it with the absolute input, then
; adds +1 or -1 to the f32 bit pattern unless the compare is equal/unordered
; (v_cmp_nlg) or the low bit of the pattern is already 1. The sign bit
; (0x80000000 of the high input dword) is OR'ed back in, and the result goes
; through the same bit-16 + 0x7fff add and 0x400000 select seen in the f32
; test, keyed on v_cmp_u_f64 of the input against itself.
; NOTE(review): the +/-1 adjustment presumably avoids double rounding
; (round-to-odd intermediate); confirm against the AMDGPU fptrunc lowering.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2237define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2238; GCN-LABEL: test_load_store_f64_to_bf16:
2239; GCN:       ; %bb.0:
2240; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2241; GCN-NEXT:    s_mov_b32 s6, 0
2242; GCN-NEXT:    s_mov_b32 s7, 0xf000
2243; GCN-NEXT:    s_mov_b32 s4, s6
2244; GCN-NEXT:    s_mov_b32 s5, s6
2245; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2246; GCN-NEXT:    s_waitcnt vmcnt(0)
2247; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
2248; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2249; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2250; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2251; GCN-NEXT:    s_setpc_b64 s[30:31]
2252;
2253; GFX7-LABEL: test_load_store_f64_to_bf16:
2254; GFX7:       ; %bb.0:
2255; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2256; GFX7-NEXT:    s_mov_b32 s6, 0
2257; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2258; GFX7-NEXT:    s_mov_b32 s4, s6
2259; GFX7-NEXT:    s_mov_b32 s5, s6
2260; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2261; GFX7-NEXT:    s_waitcnt vmcnt(0)
2262; GFX7-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
2263; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2264; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2265; GFX7-NEXT:    s_waitcnt vmcnt(0)
2266; GFX7-NEXT:    s_setpc_b64 s[30:31]
2267;
2268; GFX8-LABEL: test_load_store_f64_to_bf16:
2269; GFX8:       ; %bb.0:
2270; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2271; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2272; GFX8-NEXT:    s_waitcnt vmcnt(0)
2273; GFX8-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
2274; GFX8-NEXT:    v_and_b32_e32 v7, 0x80000000, v1
2275; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
2276; GFX8-NEXT:    v_and_b32_e32 v8, 1, v6
2277; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
2278; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2279; GFX8-NEXT:    v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2280; GFX8-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[4:5]
2281; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v6, v4
2282; GFX8-NEXT:    s_or_b64 vcc, s[6:7], vcc
2283; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
2284; GFX8-NEXT:    v_or_b32_e32 v5, v4, v7
2285; GFX8-NEXT:    v_bfe_u32 v4, v4, 16, 1
2286; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
2287; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
2288; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2289; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v5
2290; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
2291; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2292; GFX8-NEXT:    flat_store_short v[2:3], v0
2293; GFX8-NEXT:    s_waitcnt vmcnt(0)
2294; GFX8-NEXT:    s_setpc_b64 s[30:31]
2295;
2296; GFX9-LABEL: test_load_store_f64_to_bf16:
2297; GFX9:       ; %bb.0:
2298; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2299; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
2300; GFX9-NEXT:    s_brev_b32 s8, 1
2301; GFX9-NEXT:    s_movk_i32 s9, 0x7fff
2302; GFX9-NEXT:    s_waitcnt vmcnt(0)
2303; GFX9-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
2304; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
2305; GFX9-NEXT:    v_and_b32_e32 v7, 1, v6
2306; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
2307; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2308; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2309; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2310; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
2311; GFX9-NEXT:    s_or_b64 vcc, s[4:5], vcc
2312; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
2313; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2314; GFX9-NEXT:    v_and_or_b32 v5, v1, s8, v4
2315; GFX9-NEXT:    v_bfe_u32 v4, v4, 16, 1
2316; GFX9-NEXT:    v_add3_u32 v4, v4, v5, s9
2317; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v5
2318; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
2319; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
2320; GFX9-NEXT:    s_waitcnt vmcnt(0)
2321; GFX9-NEXT:    s_setpc_b64 s[30:31]
2322;
2323; GFX10-LABEL: test_load_store_f64_to_bf16:
2324; GFX10:       ; %bb.0:
2325; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2326; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
2327; GFX10-NEXT:    s_waitcnt vmcnt(0)
2328; GFX10-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
2329; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
2330; GFX10-NEXT:    v_and_b32_e32 v7, 1, v6
2331; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
2332; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
2333; GFX10-NEXT:    v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
2334; GFX10-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s5
2335; GFX10-NEXT:    s_or_b32 vcc_lo, s4, vcc_lo
2336; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v4
2337; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2338; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2339; GFX10-NEXT:    v_and_or_b32 v5, 0x80000000, v1, v4
2340; GFX10-NEXT:    v_bfe_u32 v4, v4, 16, 1
2341; GFX10-NEXT:    v_add3_u32 v4, v4, v5, 0x7fff
2342; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v5
2343; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2344; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
2345; GFX10-NEXT:    s_setpc_b64 s[30:31]
2346;
2347; GFX11-LABEL: test_load_store_f64_to_bf16:
2348; GFX11:       ; %bb.0:
2349; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2350; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
2351; GFX11-NEXT:    s_waitcnt vmcnt(0)
2352; GFX11-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
2353; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2354; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
2355; GFX11-NEXT:    v_and_b32_e32 v7, 1, v6
2356; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
2357; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2358; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
2359; GFX11-NEXT:    v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
2360; GFX11-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s1
2361; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2362; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
2363; GFX11-NEXT:    v_add_nc_u32_e32 v4, v6, v4
2364; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2365; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2366; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2367; GFX11-NEXT:    v_and_or_b32 v5, 0x80000000, v1, v4
2368; GFX11-NEXT:    v_bfe_u32 v4, v4, 16, 1
2369; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2370; GFX11-NEXT:    v_add3_u32 v4, v4, v5, 0x7fff
2371; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v5
2372; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2373; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
2374; GFX11-NEXT:    s_setpc_b64 s[30:31]
2375  %val = load double, ptr addrspace(1) %in
2376  %val.bf16 = fptrunc double %val to bfloat
2377  store bfloat %val.bf16, ptr addrspace(1) %out
2378  ret void
2379}
2380
; Loads a bfloat from %in, widens it to float with fpext, and stores the
; 32-bit result to %out (both pointers are addrspace(1)).
; Every run line expects the same shape: a 16-bit unsigned load, a left shift
; by 16 that moves the loaded bits into the upper half of the dword, and a
; 32-bit store. No conversion instruction appears in the expected output.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2381define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2382; GCN-LABEL: test_load_store_bf16_to_f32:
2383; GCN:       ; %bb.0:
2384; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2385; GCN-NEXT:    s_mov_b32 s6, 0
2386; GCN-NEXT:    s_mov_b32 s7, 0xf000
2387; GCN-NEXT:    s_mov_b32 s4, s6
2388; GCN-NEXT:    s_mov_b32 s5, s6
2389; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2390; GCN-NEXT:    s_waitcnt vmcnt(0)
2391; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2392; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2393; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2394; GCN-NEXT:    s_setpc_b64 s[30:31]
2395;
2396; GFX7-LABEL: test_load_store_bf16_to_f32:
2397; GFX7:       ; %bb.0:
2398; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2399; GFX7-NEXT:    s_mov_b32 s6, 0
2400; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2401; GFX7-NEXT:    s_mov_b32 s4, s6
2402; GFX7-NEXT:    s_mov_b32 s5, s6
2403; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2404; GFX7-NEXT:    s_waitcnt vmcnt(0)
2405; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2406; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2407; GFX7-NEXT:    s_waitcnt vmcnt(0)
2408; GFX7-NEXT:    s_setpc_b64 s[30:31]
2409;
2410; GFX8-LABEL: test_load_store_bf16_to_f32:
2411; GFX8:       ; %bb.0:
2412; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2413; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2414; GFX8-NEXT:    s_waitcnt vmcnt(0)
2415; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2416; GFX8-NEXT:    flat_store_dword v[2:3], v0
2417; GFX8-NEXT:    s_waitcnt vmcnt(0)
2418; GFX8-NEXT:    s_setpc_b64 s[30:31]
2419;
2420; GFX9-LABEL: test_load_store_bf16_to_f32:
2421; GFX9:       ; %bb.0:
2422; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2423; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
2424; GFX9-NEXT:    s_waitcnt vmcnt(0)
2425; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2426; GFX9-NEXT:    global_store_dword v[2:3], v0, off
2427; GFX9-NEXT:    s_waitcnt vmcnt(0)
2428; GFX9-NEXT:    s_setpc_b64 s[30:31]
2429;
2430; GFX10-LABEL: test_load_store_bf16_to_f32:
2431; GFX10:       ; %bb.0:
2432; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2433; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
2434; GFX10-NEXT:    s_waitcnt vmcnt(0)
2435; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2436; GFX10-NEXT:    global_store_dword v[2:3], v0, off
2437; GFX10-NEXT:    s_setpc_b64 s[30:31]
2438;
2439; GFX11-LABEL: test_load_store_bf16_to_f32:
2440; GFX11:       ; %bb.0:
2441; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2442; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
2443; GFX11-NEXT:    s_waitcnt vmcnt(0)
2444; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2445; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
2446; GFX11-NEXT:    s_setpc_b64 s[30:31]
2447  %val = load bfloat, ptr addrspace(1) %in
2448  %val.f32 = fpext bfloat %val to float
2449  store float %val.f32, ptr addrspace(1) %out
2450  ret void
2451}
2452
; Loads a bfloat from %in, widens it to double with fpext, and stores the
; 64-bit result to %out (both pointers are addrspace(1)).
; Every run line expects a 16-bit unsigned load, a left shift by 16 that moves
; the loaded bits into the upper half of a dword, a v_cvt_f64_f32 of that
; dword, and a 64-bit store.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2453define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2454; GCN-LABEL: test_load_store_bf16_to_f64:
2455; GCN:       ; %bb.0:
2456; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2457; GCN-NEXT:    s_mov_b32 s6, 0
2458; GCN-NEXT:    s_mov_b32 s7, 0xf000
2459; GCN-NEXT:    s_mov_b32 s4, s6
2460; GCN-NEXT:    s_mov_b32 s5, s6
2461; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2462; GCN-NEXT:    s_waitcnt vmcnt(0)
2463; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2464; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
2465; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2466; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2467; GCN-NEXT:    s_setpc_b64 s[30:31]
2468;
2469; GFX7-LABEL: test_load_store_bf16_to_f64:
2470; GFX7:       ; %bb.0:
2471; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2472; GFX7-NEXT:    s_mov_b32 s6, 0
2473; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2474; GFX7-NEXT:    s_mov_b32 s4, s6
2475; GFX7-NEXT:    s_mov_b32 s5, s6
2476; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2477; GFX7-NEXT:    s_waitcnt vmcnt(0)
2478; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2479; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
2480; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2481; GFX7-NEXT:    s_waitcnt vmcnt(0)
2482; GFX7-NEXT:    s_setpc_b64 s[30:31]
2483;
2484; GFX8-LABEL: test_load_store_bf16_to_f64:
2485; GFX8:       ; %bb.0:
2486; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2487; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2488; GFX8-NEXT:    s_waitcnt vmcnt(0)
2489; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2490; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
2491; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2492; GFX8-NEXT:    s_waitcnt vmcnt(0)
2493; GFX8-NEXT:    s_setpc_b64 s[30:31]
2494;
2495; GFX9-LABEL: test_load_store_bf16_to_f64:
2496; GFX9:       ; %bb.0:
2497; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
2499; GFX9-NEXT:    s_waitcnt vmcnt(0)
2500; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2501; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
2502; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
2503; GFX9-NEXT:    s_waitcnt vmcnt(0)
2504; GFX9-NEXT:    s_setpc_b64 s[30:31]
2505;
2506; GFX10-LABEL: test_load_store_bf16_to_f64:
2507; GFX10:       ; %bb.0:
2508; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2509; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
2510; GFX10-NEXT:    s_waitcnt vmcnt(0)
2511; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2512; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
2513; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
2514; GFX10-NEXT:    s_setpc_b64 s[30:31]
2515;
2516; GFX11-LABEL: test_load_store_bf16_to_f64:
2517; GFX11:       ; %bb.0:
2518; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2519; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
2520; GFX11-NEXT:    s_waitcnt vmcnt(0)
2521; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2522; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2523; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
2524; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
2525; GFX11-NEXT:    s_setpc_b64 s[30:31]
2526  %val = load bfloat, ptr addrspace(1) %in
2527  %val.f64 = fpext bfloat %val to double
2528  store double %val.f64, ptr addrspace(1) %out
2529  ret void
2530}
2531
; Copies a <2 x bfloat> vector from %in to %out (both pointers are
; addrspace(1)) with no arithmetic in between.
; Every run line expects the vector to move as one 32-bit load followed by one
; 32-bit store, with no per-element unpacking.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2532define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2533; GCN-LABEL: test_load_store_v2bf16:
2534; GCN:       ; %bb.0:
2535; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2536; GCN-NEXT:    s_mov_b32 s6, 0
2537; GCN-NEXT:    s_mov_b32 s7, 0xf000
2538; GCN-NEXT:    s_mov_b32 s4, s6
2539; GCN-NEXT:    s_mov_b32 s5, s6
2540; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2541; GCN-NEXT:    s_waitcnt vmcnt(0)
2542; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2543; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2544; GCN-NEXT:    s_setpc_b64 s[30:31]
2545;
2546; GFX7-LABEL: test_load_store_v2bf16:
2547; GFX7:       ; %bb.0:
2548; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2549; GFX7-NEXT:    s_mov_b32 s6, 0
2550; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2551; GFX7-NEXT:    s_mov_b32 s4, s6
2552; GFX7-NEXT:    s_mov_b32 s5, s6
2553; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2554; GFX7-NEXT:    s_waitcnt vmcnt(0)
2555; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2556; GFX7-NEXT:    s_waitcnt vmcnt(0)
2557; GFX7-NEXT:    s_setpc_b64 s[30:31]
2558;
2559; GFX8-LABEL: test_load_store_v2bf16:
2560; GFX8:       ; %bb.0:
2561; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2563; GFX8-NEXT:    s_waitcnt vmcnt(0)
2564; GFX8-NEXT:    flat_store_dword v[2:3], v0
2565; GFX8-NEXT:    s_waitcnt vmcnt(0)
2566; GFX8-NEXT:    s_setpc_b64 s[30:31]
2567;
2568; GFX9-LABEL: test_load_store_v2bf16:
2569; GFX9:       ; %bb.0:
2570; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2571; GFX9-NEXT:    global_load_dword v0, v[0:1], off
2572; GFX9-NEXT:    s_waitcnt vmcnt(0)
2573; GFX9-NEXT:    global_store_dword v[2:3], v0, off
2574; GFX9-NEXT:    s_waitcnt vmcnt(0)
2575; GFX9-NEXT:    s_setpc_b64 s[30:31]
2576;
2577; GFX10-LABEL: test_load_store_v2bf16:
2578; GFX10:       ; %bb.0:
2579; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2581; GFX10-NEXT:    s_waitcnt vmcnt(0)
2582; GFX10-NEXT:    global_store_dword v[2:3], v0, off
2583; GFX10-NEXT:    s_setpc_b64 s[30:31]
2584;
2585; GFX11-LABEL: test_load_store_v2bf16:
2586; GFX11:       ; %bb.0:
2587; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2588; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2589; GFX11-NEXT:    s_waitcnt vmcnt(0)
2590; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
2591; GFX11-NEXT:    s_setpc_b64 s[30:31]
2592  %val = load <2 x bfloat>, ptr addrspace(1) %in
2593  store <2 x bfloat> %val, ptr addrspace(1) %out
2594  ret void
2595}
2596
; Copies a <4 x bfloat> vector from %in to %out (both pointers are
; addrspace(1)) with no arithmetic in between.
; Every run line expects the vector to move as one 64-bit load into v[0:1]
; followed by one 64-bit store, with no per-element unpacking.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2597define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2598; GCN-LABEL: test_load_store_v4bf16:
2599; GCN:       ; %bb.0:
2600; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2601; GCN-NEXT:    s_mov_b32 s6, 0
2602; GCN-NEXT:    s_mov_b32 s7, 0xf000
2603; GCN-NEXT:    s_mov_b32 s4, s6
2604; GCN-NEXT:    s_mov_b32 s5, s6
2605; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2606; GCN-NEXT:    s_waitcnt vmcnt(0)
2607; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2608; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2609; GCN-NEXT:    s_setpc_b64 s[30:31]
2610;
2611; GFX7-LABEL: test_load_store_v4bf16:
2612; GFX7:       ; %bb.0:
2613; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2614; GFX7-NEXT:    s_mov_b32 s6, 0
2615; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2616; GFX7-NEXT:    s_mov_b32 s4, s6
2617; GFX7-NEXT:    s_mov_b32 s5, s6
2618; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2619; GFX7-NEXT:    s_waitcnt vmcnt(0)
2620; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2621; GFX7-NEXT:    s_waitcnt vmcnt(0)
2622; GFX7-NEXT:    s_setpc_b64 s[30:31]
2623;
2624; GFX8-LABEL: test_load_store_v4bf16:
2625; GFX8:       ; %bb.0:
2626; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2627; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2628; GFX8-NEXT:    s_waitcnt vmcnt(0)
2629; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2630; GFX8-NEXT:    s_waitcnt vmcnt(0)
2631; GFX8-NEXT:    s_setpc_b64 s[30:31]
2632;
2633; GFX9-LABEL: test_load_store_v4bf16:
2634; GFX9:       ; %bb.0:
2635; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
2637; GFX9-NEXT:    s_waitcnt vmcnt(0)
2638; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
2639; GFX9-NEXT:    s_waitcnt vmcnt(0)
2640; GFX9-NEXT:    s_setpc_b64 s[30:31]
2641;
2642; GFX10-LABEL: test_load_store_v4bf16:
2643; GFX10:       ; %bb.0:
2644; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2645; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
2646; GFX10-NEXT:    s_waitcnt vmcnt(0)
2647; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
2648; GFX10-NEXT:    s_setpc_b64 s[30:31]
2649;
2650; GFX11-LABEL: test_load_store_v4bf16:
2651; GFX11:       ; %bb.0:
2652; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2653; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
2654; GFX11-NEXT:    s_waitcnt vmcnt(0)
2655; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
2656; GFX11-NEXT:    s_setpc_b64 s[30:31]
2657  %val = load <4 x bfloat>, ptr addrspace(1) %in
2658  store <4 x bfloat> %val, ptr addrspace(1) %out
2659  ret void
2660}
2661
; Copies an <8 x bfloat> vector from %in to %out (both pointers are
; addrspace(1)) with no arithmetic in between.
; Every run line expects the vector to move as one 128-bit load into v[4:7]
; followed by one 128-bit store, with no per-element unpacking.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2662define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2663; GCN-LABEL: test_load_store_v8bf16:
2664; GCN:       ; %bb.0:
2665; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2666; GCN-NEXT:    s_mov_b32 s6, 0
2667; GCN-NEXT:    s_mov_b32 s7, 0xf000
2668; GCN-NEXT:    s_mov_b32 s4, s6
2669; GCN-NEXT:    s_mov_b32 s5, s6
2670; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
2671; GCN-NEXT:    s_waitcnt vmcnt(0)
2672; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
2673; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2674; GCN-NEXT:    s_setpc_b64 s[30:31]
2675;
2676; GFX7-LABEL: test_load_store_v8bf16:
2677; GFX7:       ; %bb.0:
2678; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2679; GFX7-NEXT:    s_mov_b32 s6, 0
2680; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2681; GFX7-NEXT:    s_mov_b32 s4, s6
2682; GFX7-NEXT:    s_mov_b32 s5, s6
2683; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
2684; GFX7-NEXT:    s_waitcnt vmcnt(0)
2685; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
2686; GFX7-NEXT:    s_waitcnt vmcnt(0)
2687; GFX7-NEXT:    s_setpc_b64 s[30:31]
2688;
2689; GFX8-LABEL: test_load_store_v8bf16:
2690; GFX8:       ; %bb.0:
2691; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2692; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
2693; GFX8-NEXT:    s_waitcnt vmcnt(0)
2694; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
2695; GFX8-NEXT:    s_waitcnt vmcnt(0)
2696; GFX8-NEXT:    s_setpc_b64 s[30:31]
2697;
2698; GFX9-LABEL: test_load_store_v8bf16:
2699; GFX9:       ; %bb.0:
2700; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2701; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
2702; GFX9-NEXT:    s_waitcnt vmcnt(0)
2703; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
2704; GFX9-NEXT:    s_waitcnt vmcnt(0)
2705; GFX9-NEXT:    s_setpc_b64 s[30:31]
2706;
2707; GFX10-LABEL: test_load_store_v8bf16:
2708; GFX10:       ; %bb.0:
2709; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2710; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
2711; GFX10-NEXT:    s_waitcnt vmcnt(0)
2712; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
2713; GFX10-NEXT:    s_setpc_b64 s[30:31]
2714;
2715; GFX11-LABEL: test_load_store_v8bf16:
2716; GFX11:       ; %bb.0:
2717; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2718; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
2719; GFX11-NEXT:    s_waitcnt vmcnt(0)
2720; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
2721; GFX11-NEXT:    s_setpc_b64 s[30:31]
2722  %val = load <8 x bfloat>, ptr addrspace(1) %in
2723  store <8 x bfloat> %val, ptr addrspace(1) %out
2724  ret void
2725}
2726
; Copies a <16 x bfloat> from %in to %out (both addrspace(1)) with one load
; followed by one store. Every checked run moves the value as two 128-bit
; halves, the second half at byte offset 16. The GFX8 run forms the +16
; addresses with explicit v_add_u32/v_addc_u32 pairs rather than an offset
; field on the memory instruction.
2727define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2728; GCN-LABEL: test_load_store_v16bf16:
2729; GCN:       ; %bb.0:
2730; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2731; GCN-NEXT:    s_mov_b32 s6, 0
2732; GCN-NEXT:    s_mov_b32 s7, 0xf000
2733; GCN-NEXT:    s_mov_b32 s4, s6
2734; GCN-NEXT:    s_mov_b32 s5, s6
2735; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
2736; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
2737; GCN-NEXT:    s_waitcnt vmcnt(1)
2738; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
2739; GCN-NEXT:    s_waitcnt vmcnt(1)
2740; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
2741; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2742; GCN-NEXT:    s_setpc_b64 s[30:31]
2743;
2744; GFX7-LABEL: test_load_store_v16bf16:
2745; GFX7:       ; %bb.0:
2746; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2747; GFX7-NEXT:    s_mov_b32 s6, 0
2748; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2749; GFX7-NEXT:    s_mov_b32 s4, s6
2750; GFX7-NEXT:    s_mov_b32 s5, s6
2751; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
2752; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
2753; GFX7-NEXT:    s_waitcnt vmcnt(1)
2754; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
2755; GFX7-NEXT:    s_waitcnt vmcnt(1)
2756; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
2757; GFX7-NEXT:    s_waitcnt vmcnt(0)
2758; GFX7-NEXT:    s_setpc_b64 s[30:31]
2759;
2760; GFX8-LABEL: test_load_store_v16bf16:
2761; GFX8:       ; %bb.0:
2762; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2763; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
2764; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
2765; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
2766; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
2767; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
2768; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
2769; GFX8-NEXT:    s_waitcnt vmcnt(1)
2770; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
2771; GFX8-NEXT:    s_waitcnt vmcnt(1)
2772; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
2773; GFX8-NEXT:    s_waitcnt vmcnt(0)
2774; GFX8-NEXT:    s_setpc_b64 s[30:31]
2775;
2776; GFX9-LABEL: test_load_store_v16bf16:
2777; GFX9:       ; %bb.0:
2778; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2779; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
2780; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
2781; GFX9-NEXT:    s_waitcnt vmcnt(1)
2782; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
2783; GFX9-NEXT:    s_waitcnt vmcnt(1)
2784; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
2785; GFX9-NEXT:    s_waitcnt vmcnt(0)
2786; GFX9-NEXT:    s_setpc_b64 s[30:31]
2787;
2788; GFX10-LABEL: test_load_store_v16bf16:
2789; GFX10:       ; %bb.0:
2790; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2791; GFX10-NEXT:    s_clause 0x1
2792; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
2793; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
2794; GFX10-NEXT:    s_waitcnt vmcnt(1)
2795; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
2796; GFX10-NEXT:    s_waitcnt vmcnt(0)
2797; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
2798; GFX10-NEXT:    s_setpc_b64 s[30:31]
2799;
2800; GFX11-LABEL: test_load_store_v16bf16:
2801; GFX11:       ; %bb.0:
2802; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803; GFX11-NEXT:    s_clause 0x1
2804; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
2805; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off
2806; GFX11-NEXT:    s_waitcnt vmcnt(1)
2807; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
2808; GFX11-NEXT:    s_waitcnt vmcnt(0)
2809; GFX11-NEXT:    global_store_b128 v[2:3], v[8:11], off
2810; GFX11-NEXT:    s_setpc_b64 s[30:31]
2811  %val = load <16 x bfloat>, ptr addrspace(1) %in
2812  store <16 x bfloat> %val, ptr addrspace(1) %out
2813  ret void
2814}
2815
; Stores a scalar bfloat received as a by-value argument to %out.
; In the GCN and GFX7 check blocks the argument is multiplied by 1.0 and then
; shifted right by 16 before a 16-bit store; the GFX8 and newer blocks store
; v0 directly.
; Presumably the two oldest targets carry the bfloat in the upper 16 bits of a
; 32-bit register -- TODO confirm against the argument lowering.
2816define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
2817; GCN-LABEL: test_arg_store:
2818; GCN:       ; %bb.0:
2819; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2820; GCN-NEXT:    s_mov_b32 s7, 0xf000
2821; GCN-NEXT:    s_mov_b32 s6, 0
2822; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2823; GCN-NEXT:    s_mov_b32 s4, s6
2824; GCN-NEXT:    s_mov_b32 s5, s6
2825; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2826; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
2827; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2828; GCN-NEXT:    s_setpc_b64 s[30:31]
2829;
2830; GFX7-LABEL: test_arg_store:
2831; GFX7:       ; %bb.0:
2832; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2833; GFX7-NEXT:    s_mov_b32 s6, 0
2834; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2835; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2836; GFX7-NEXT:    s_mov_b32 s4, s6
2837; GFX7-NEXT:    s_mov_b32 s5, s6
2838; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2839; GFX7-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
2840; GFX7-NEXT:    s_waitcnt vmcnt(0)
2841; GFX7-NEXT:    s_setpc_b64 s[30:31]
2842;
2843; GFX8-LABEL: test_arg_store:
2844; GFX8:       ; %bb.0:
2845; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2846; GFX8-NEXT:    flat_store_short v[1:2], v0
2847; GFX8-NEXT:    s_waitcnt vmcnt(0)
2848; GFX8-NEXT:    s_setpc_b64 s[30:31]
2849;
2850; GFX9-LABEL: test_arg_store:
2851; GFX9:       ; %bb.0:
2852; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2853; GFX9-NEXT:    global_store_short v[1:2], v0, off
2854; GFX9-NEXT:    s_waitcnt vmcnt(0)
2855; GFX9-NEXT:    s_setpc_b64 s[30:31]
2856;
2857; GFX10-LABEL: test_arg_store:
2858; GFX10:       ; %bb.0:
2859; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2860; GFX10-NEXT:    global_store_short v[1:2], v0, off
2861; GFX10-NEXT:    s_setpc_b64 s[30:31]
2862;
2863; GFX11-LABEL: test_arg_store:
2864; GFX11:       ; %bb.0:
2865; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2866; GFX11-NEXT:    global_store_b16 v[1:2], v0, off
2867; GFX11-NEXT:    s_setpc_b64 s[30:31]
2868  store bfloat %in, ptr addrspace(1) %out
2869  ret void
2870}
2871
; Stores a <2 x bfloat> argument to %out.
; In the GCN and GFX7 check blocks the two elements arrive in v0 and v1, are
; each multiplied by 1.0, and are packed into one dword with a 16-bit shift
; plus v_alignbit_b32 before a single dword store. The GFX8 and newer blocks
; store v0 as a dword with no packing instructions.
2872define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
2873; GCN-LABEL: test_arg_store_v2bf16:
2874; GCN:       ; %bb.0:
2875; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2876; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2877; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2878; GCN-NEXT:    s_mov_b32 s6, 0
2879; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2880; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2881; GCN-NEXT:    s_mov_b32 s7, 0xf000
2882; GCN-NEXT:    s_mov_b32 s4, s6
2883; GCN-NEXT:    s_mov_b32 s5, s6
2884; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2885; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2886; GCN-NEXT:    s_setpc_b64 s[30:31]
2887;
2888; GFX7-LABEL: test_arg_store_v2bf16:
2889; GFX7:       ; %bb.0:
2890; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2892; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2893; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2894; GFX7-NEXT:    s_mov_b32 s6, 0
2895; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2896; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2897; GFX7-NEXT:    s_mov_b32 s4, s6
2898; GFX7-NEXT:    s_mov_b32 s5, s6
2899; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2900; GFX7-NEXT:    s_waitcnt vmcnt(0)
2901; GFX7-NEXT:    s_setpc_b64 s[30:31]
2902;
2903; GFX8-LABEL: test_arg_store_v2bf16:
2904; GFX8:       ; %bb.0:
2905; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2906; GFX8-NEXT:    flat_store_dword v[1:2], v0
2907; GFX8-NEXT:    s_waitcnt vmcnt(0)
2908; GFX8-NEXT:    s_setpc_b64 s[30:31]
2909;
2910; GFX9-LABEL: test_arg_store_v2bf16:
2911; GFX9:       ; %bb.0:
2912; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2913; GFX9-NEXT:    global_store_dword v[1:2], v0, off
2914; GFX9-NEXT:    s_waitcnt vmcnt(0)
2915; GFX9-NEXT:    s_setpc_b64 s[30:31]
2916;
2917; GFX10-LABEL: test_arg_store_v2bf16:
2918; GFX10:       ; %bb.0:
2919; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2920; GFX10-NEXT:    global_store_dword v[1:2], v0, off
2921; GFX10-NEXT:    s_setpc_b64 s[30:31]
2922;
2923; GFX11-LABEL: test_arg_store_v2bf16:
2924; GFX11:       ; %bb.0:
2925; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2926; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
2927; GFX11-NEXT:    s_setpc_b64 s[30:31]
2928  store <2 x bfloat> %in, ptr addrspace(1) %out
2929  ret void
2930}
2931
; Stores a <3 x bfloat> argument to %out.
; Every checked run splits the 6-byte value into a dword store at offset 0 and
; a 16-bit store at byte offset 4. The GCN and GFX7 check blocks first convert
; each element (multiply by 1.0, shift right by 16) and pack the low pair with
; v_alignbit_b32. The GFX8 block computes the +4 address with an explicit
; v_add_u32/v_addc_u32 pair.
2932define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
2933; GCN-LABEL: test_arg_store_v3bf16:
2934; GCN:       ; %bb.0:
2935; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2936; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2937; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2938; GCN-NEXT:    s_mov_b32 s7, 0xf000
2939; GCN-NEXT:    s_mov_b32 s6, 0
2940; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2941; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2942; GCN-NEXT:    s_mov_b32 s4, s6
2943; GCN-NEXT:    s_mov_b32 s5, s6
2944; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2945; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2946; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
2947; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
2948; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2949; GCN-NEXT:    s_setpc_b64 s[30:31]
2950;
2951; GFX7-LABEL: test_arg_store_v3bf16:
2952; GFX7:       ; %bb.0:
2953; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2954; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2955; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2956; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2957; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2958; GFX7-NEXT:    s_mov_b32 s6, 0
2959; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
2960; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2961; GFX7-NEXT:    s_mov_b32 s4, s6
2962; GFX7-NEXT:    s_mov_b32 s5, s6
2963; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2964; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
2965; GFX7-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
2966; GFX7-NEXT:    s_waitcnt vmcnt(0)
2967; GFX7-NEXT:    s_setpc_b64 s[30:31]
2968;
2969; GFX8-LABEL: test_arg_store_v3bf16:
2970; GFX8:       ; %bb.0:
2971; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2972; GFX8-NEXT:    flat_store_dword v[2:3], v0
2973; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
2974; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2975; GFX8-NEXT:    flat_store_short v[2:3], v1
2976; GFX8-NEXT:    s_waitcnt vmcnt(0)
2977; GFX8-NEXT:    s_setpc_b64 s[30:31]
2978;
2979; GFX9-LABEL: test_arg_store_v3bf16:
2980; GFX9:       ; %bb.0:
2981; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2982; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
2983; GFX9-NEXT:    global_store_dword v[2:3], v0, off
2984; GFX9-NEXT:    s_waitcnt vmcnt(0)
2985; GFX9-NEXT:    s_setpc_b64 s[30:31]
2986;
2987; GFX10-LABEL: test_arg_store_v3bf16:
2988; GFX10:       ; %bb.0:
2989; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2990; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
2991; GFX10-NEXT:    global_store_dword v[2:3], v0, off
2992; GFX10-NEXT:    s_setpc_b64 s[30:31]
2993;
2994; GFX11-LABEL: test_arg_store_v3bf16:
2995; GFX11:       ; %bb.0:
2996; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2997; GFX11-NEXT:    s_clause 0x1
2998; GFX11-NEXT:    global_store_b16 v[2:3], v1, off offset:4
2999; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
3000; GFX11-NEXT:    s_setpc_b64 s[30:31]
3001  store <3 x bfloat> %in, ptr addrspace(1) %out
3002  ret void
3003}
3004
; Stores a <4 x bfloat> argument to %out as a single 64-bit store.
; In the GCN and GFX7 check blocks the four elements arrive in v0..v3, are
; each multiplied by 1.0, and are packed pairwise with v_alignbit_b32 into two
; dwords. The GFX8 and newer blocks store v[0:1] directly.
3005define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
3006; GCN-LABEL: test_arg_store_v4bf16:
3007; GCN:       ; %bb.0:
3008; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3009; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3010; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
3011; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3012; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3013; GCN-NEXT:    s_mov_b32 s6, 0
3014; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
3015; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
3016; GCN-NEXT:    v_alignbit_b32 v1, v3, v2, 16
3017; GCN-NEXT:    v_alignbit_b32 v0, v6, v0, 16
3018; GCN-NEXT:    s_mov_b32 s7, 0xf000
3019; GCN-NEXT:    s_mov_b32 s4, s6
3020; GCN-NEXT:    s_mov_b32 s5, s6
3021; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
3022; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3023; GCN-NEXT:    s_setpc_b64 s[30:31]
3024;
3025; GFX7-LABEL: test_arg_store_v4bf16:
3026; GFX7:       ; %bb.0:
3027; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3028; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3029; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3030; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
3031; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
3032; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3033; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3034; GFX7-NEXT:    s_mov_b32 s6, 0
3035; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
3036; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
3037; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3038; GFX7-NEXT:    s_mov_b32 s4, s6
3039; GFX7-NEXT:    s_mov_b32 s5, s6
3040; GFX7-NEXT:    buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
3041; GFX7-NEXT:    s_waitcnt vmcnt(0)
3042; GFX7-NEXT:    s_setpc_b64 s[30:31]
3043;
3044; GFX8-LABEL: test_arg_store_v4bf16:
3045; GFX8:       ; %bb.0:
3046; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3047; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
3048; GFX8-NEXT:    s_waitcnt vmcnt(0)
3049; GFX8-NEXT:    s_setpc_b64 s[30:31]
3050;
3051; GFX9-LABEL: test_arg_store_v4bf16:
3052; GFX9:       ; %bb.0:
3053; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3054; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
3055; GFX9-NEXT:    s_waitcnt vmcnt(0)
3056; GFX9-NEXT:    s_setpc_b64 s[30:31]
3057;
3058; GFX10-LABEL: test_arg_store_v4bf16:
3059; GFX10:       ; %bb.0:
3060; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3061; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
3062; GFX10-NEXT:    s_setpc_b64 s[30:31]
3063;
3064; GFX11-LABEL: test_arg_store_v4bf16:
3065; GFX11:       ; %bb.0:
3066; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3067; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
3068; GFX11-NEXT:    s_setpc_b64 s[30:31]
3069  store <4 x bfloat> %in, ptr addrspace(1)  %out
3070  ret void
3071}
3072
; Stores an <8 x bfloat> argument to %out as a single 128-bit store.
; In the GCN and GFX7 check blocks the eight elements arrive in v0..v7, are
; each multiplied by 1.0, and are packed pairwise with four v_alignbit_b32
; instructions. The GFX8 and newer blocks store v[0:3] directly.
3073define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
3074; GCN-LABEL: test_arg_store_v8bf16:
3075; GCN:       ; %bb.0:
3076; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3077; GCN-NEXT:    s_mov_b32 s7, 0xf000
3078; GCN-NEXT:    s_mov_b32 s6, 0
3079; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
3080; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
3081; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
3082; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
3083; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3084; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v2
3085; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3086; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3087; GCN-NEXT:    s_mov_b32 s4, s6
3088; GCN-NEXT:    s_mov_b32 s5, s6
3089; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
3090; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
3091; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
3092; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
3093; GCN-NEXT:    v_alignbit_b32 v3, v2, v6, 16
3094; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
3095; GCN-NEXT:    v_alignbit_b32 v1, v7, v10, 16
3096; GCN-NEXT:    v_alignbit_b32 v0, v11, v0, 16
3097; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
3098; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3099; GCN-NEXT:    s_setpc_b64 s[30:31]
3100;
3101; GFX7-LABEL: test_arg_store_v8bf16:
3102; GFX7:       ; %bb.0:
3103; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3104; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
3105; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
3106; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3107; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3108; GFX7-NEXT:    s_mov_b32 s6, 0
3109; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
3110; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
3111; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
3112; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
3113; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
3114; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
3115; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3116; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3117; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3118; GFX7-NEXT:    s_mov_b32 s4, s6
3119; GFX7-NEXT:    s_mov_b32 s5, s6
3120; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
3121; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
3122; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
3123; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
3124; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
3125; GFX7-NEXT:    s_waitcnt vmcnt(0)
3126; GFX7-NEXT:    s_setpc_b64 s[30:31]
3127;
3128; GFX8-LABEL: test_arg_store_v8bf16:
3129; GFX8:       ; %bb.0:
3130; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3131; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3132; GFX8-NEXT:    s_waitcnt vmcnt(0)
3133; GFX8-NEXT:    s_setpc_b64 s[30:31]
3134;
3135; GFX9-LABEL: test_arg_store_v8bf16:
3136; GFX9:       ; %bb.0:
3137; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3138; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
3139; GFX9-NEXT:    s_waitcnt vmcnt(0)
3140; GFX9-NEXT:    s_setpc_b64 s[30:31]
3141;
3142; GFX10-LABEL: test_arg_store_v8bf16:
3143; GFX10:       ; %bb.0:
3144; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3145; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
3146; GFX10-NEXT:    s_setpc_b64 s[30:31]
3147;
3148; GFX11-LABEL: test_arg_store_v8bf16:
3149; GFX11:       ; %bb.0:
3150; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3151; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
3152; GFX11-NEXT:    s_setpc_b64 s[30:31]
3153  store <8 x bfloat> %in, ptr addrspace(1) %out
3154  ret void
3155}
3156
; Stores a <16 x bfloat> argument to %out as two 128-bit stores, the upper
; half at byte offset 16.
; In the GCN and GFX7 check blocks the sixteen elements arrive in v0..v15, are
; each multiplied by 1.0, and are packed pairwise with eight v_alignbit_b32
; instructions. The GFX8 and newer blocks store v[0:3] and v[4:7] directly.
; The GFX8 block also has an s_nop 0 between the first store (which reads
; v[0:3]) and the address computation that overwrites v0/v1 -- presumably a
; hazard workaround, TODO confirm.
3157define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
3158; GCN-LABEL: test_arg_store_v16bf16:
3159; GCN:       ; %bb.0:
3160; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3161; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
3162; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
3163; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
3164; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
3165; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3166; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v2
3167; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3168; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3169; GCN-NEXT:    s_mov_b32 s7, 0xf000
3170; GCN-NEXT:    s_mov_b32 s6, 0
3171; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v15
3172; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
3173; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
3174; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
3175; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
3176; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
3177; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
3178; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
3179; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
3180; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
3181; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
3182; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
3183; GCN-NEXT:    s_mov_b32 s4, s6
3184; GCN-NEXT:    s_mov_b32 s5, s6
3185; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
3186; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
3187; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
3188; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
3189; GCN-NEXT:    v_alignbit_b32 v3, v7, v6, 16
3190; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
3191; GCN-NEXT:    v_alignbit_b32 v1, v15, v18, 16
3192; GCN-NEXT:    v_alignbit_b32 v0, v19, v0, 16
3193; GCN-NEXT:    v_alignbit_b32 v7, v20, v14, 16
3194; GCN-NEXT:    v_alignbit_b32 v6, v13, v12, 16
3195; GCN-NEXT:    v_alignbit_b32 v5, v11, v10, 16
3196; GCN-NEXT:    v_alignbit_b32 v4, v9, v8, 16
3197; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
3198; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
3199; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3200; GCN-NEXT:    s_setpc_b64 s[30:31]
3201;
3202; GFX7-LABEL: test_arg_store_v16bf16:
3203; GFX7:       ; %bb.0:
3204; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3205; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
3206; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3207; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3208; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
3209; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
3210; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
3211; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
3212; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3213; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3214; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
3215; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
3216; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
3217; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v15
3218; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3219; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v14
3220; GFX7-NEXT:    v_alignbit_b32 v14, v0, v1, 16
3221; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v13
3222; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3223; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v12
3224; GFX7-NEXT:    v_alignbit_b32 v13, v0, v1, 16
3225; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v11
3226; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3227; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v10
3228; GFX7-NEXT:    v_alignbit_b32 v12, v0, v1, 16
3229; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v9
3230; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
3231; GFX7-NEXT:    s_mov_b32 s6, 0
3232; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3233; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v8
3234; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
3235; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
3236; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3237; GFX7-NEXT:    s_mov_b32 s4, s6
3238; GFX7-NEXT:    s_mov_b32 s5, s6
3239; GFX7-NEXT:    v_alignbit_b32 v11, v0, v1, 16
3240; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
3241; GFX7-NEXT:    buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
3242; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
3243; GFX7-NEXT:    s_waitcnt vmcnt(0)
3244; GFX7-NEXT:    s_setpc_b64 s[30:31]
3245;
3246; GFX8-LABEL: test_arg_store_v16bf16:
3247; GFX8:       ; %bb.0:
3248; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3249; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3250; GFX8-NEXT:    s_nop 0
3251; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
3252; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
3253; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
3254; GFX8-NEXT:    s_waitcnt vmcnt(0)
3255; GFX8-NEXT:    s_setpc_b64 s[30:31]
3256;
3257; GFX9-LABEL: test_arg_store_v16bf16:
3258; GFX9:       ; %bb.0:
3259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3260; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
3261; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3262; GFX9-NEXT:    s_waitcnt vmcnt(0)
3263; GFX9-NEXT:    s_setpc_b64 s[30:31]
3264;
3265; GFX10-LABEL: test_arg_store_v16bf16:
3266; GFX10:       ; %bb.0:
3267; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3268; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
3269; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3270; GFX10-NEXT:    s_setpc_b64 s[30:31]
3271;
3272; GFX11-LABEL: test_arg_store_v16bf16:
3273; GFX11:       ; %bb.0:
3274; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3275; GFX11-NEXT:    s_clause 0x1
3276; GFX11-NEXT:    global_store_b128 v[8:9], v[4:7], off offset:16
3277; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
3278; GFX11-NEXT:    s_setpc_b64 s[30:31]
3279  store <16 x bfloat> %in, ptr addrspace(1) %out
3280  ret void
3281}
3282
; Stores a bfloat argument marked `inreg` under the amdgpu_gfx calling
; convention to %out.
; The checks show the value arriving in scalar register s4 and the pointer in
; v[0:1]. The GCN and GFX7 check blocks multiply s4 by 1.0 into v2 and shift
; right by 16 before the 16-bit store, building the buffer descriptor in
; s[36:39]. The GFX8 and newer blocks copy s4 into v2 and store it.
3283define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
3284; GCN-LABEL: test_inreg_arg_store:
3285; GCN:       ; %bb.0:
3286; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3287; GCN-NEXT:    s_mov_b32 s39, 0xf000
3288; GCN-NEXT:    s_mov_b32 s38, 0
3289; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s4
3290; GCN-NEXT:    s_mov_b32 s36, s38
3291; GCN-NEXT:    s_mov_b32 s37, s38
3292; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
3293; GCN-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
3294; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3295; GCN-NEXT:    s_setpc_b64 s[30:31]
3296;
3297; GFX7-LABEL: test_inreg_arg_store:
3298; GFX7:       ; %bb.0:
3299; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3300; GFX7-NEXT:    s_mov_b32 s38, 0
3301; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s4
3302; GFX7-NEXT:    s_mov_b32 s39, 0xf000
3303; GFX7-NEXT:    s_mov_b32 s36, s38
3304; GFX7-NEXT:    s_mov_b32 s37, s38
3305; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
3306; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
3307; GFX7-NEXT:    s_waitcnt vmcnt(0)
3308; GFX7-NEXT:    s_setpc_b64 s[30:31]
3309;
3310; GFX8-LABEL: test_inreg_arg_store:
3311; GFX8:       ; %bb.0:
3312; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3313; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3314; GFX8-NEXT:    flat_store_short v[0:1], v2
3315; GFX8-NEXT:    s_waitcnt vmcnt(0)
3316; GFX8-NEXT:    s_setpc_b64 s[30:31]
3317;
3318; GFX9-LABEL: test_inreg_arg_store:
3319; GFX9:       ; %bb.0:
3320; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3321; GFX9-NEXT:    v_mov_b32_e32 v2, s4
3322; GFX9-NEXT:    global_store_short v[0:1], v2, off
3323; GFX9-NEXT:    s_waitcnt vmcnt(0)
3324; GFX9-NEXT:    s_setpc_b64 s[30:31]
3325;
3326; GFX10-LABEL: test_inreg_arg_store:
3327; GFX10:       ; %bb.0:
3328; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3329; GFX10-NEXT:    v_mov_b32_e32 v2, s4
3330; GFX10-NEXT:    global_store_short v[0:1], v2, off
3331; GFX10-NEXT:    s_setpc_b64 s[30:31]
3332;
3333; GFX11-LABEL: test_inreg_arg_store:
3334; GFX11:       ; %bb.0:
3335; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3336; GFX11-NEXT:    v_mov_b32_e32 v2, s4
3337; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
3338; GFX11-NEXT:    s_setpc_b64 s[30:31]
3339  store bfloat %in, ptr addrspace(1) %out
3340  ret void
3341}
3342
; Writes %val through the byval(bfloat) pointer %bv, reloads it, and returns
; the reloaded value.
; Every checked run emits only a 16-bit store addressed from s32; no load
; instruction appears in any block, and v0 (the incoming %val) is never
; overwritten before the return. The GCN and GFX7 check blocks convert into v1
; (multiply by 1.0, shift right by 16) for the store.
3343define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
3344; GCN-LABEL: test_byval:
3345; GCN:       ; %bb.0:
3346; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3347; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v0
3348; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3349; GCN-NEXT:    buffer_store_short v1, off, s[0:3], s32
3350; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3351; GCN-NEXT:    s_setpc_b64 s[30:31]
3352;
3353; GFX7-LABEL: test_byval:
3354; GFX7:       ; %bb.0:
3355; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3356; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v0
3357; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3358; GFX7-NEXT:    buffer_store_short v1, off, s[0:3], s32
3359; GFX7-NEXT:    s_waitcnt vmcnt(0)
3360; GFX7-NEXT:    s_setpc_b64 s[30:31]
3361;
3362; GFX8-LABEL: test_byval:
3363; GFX8:       ; %bb.0:
3364; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3365; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
3366; GFX8-NEXT:    s_waitcnt vmcnt(0)
3367; GFX8-NEXT:    s_setpc_b64 s[30:31]
3368;
3369; GFX9-LABEL: test_byval:
3370; GFX9:       ; %bb.0:
3371; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3372; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
3373; GFX9-NEXT:    s_waitcnt vmcnt(0)
3374; GFX9-NEXT:    s_setpc_b64 s[30:31]
3375;
3376; GFX10-LABEL: test_byval:
3377; GFX10:       ; %bb.0:
3378; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3379; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
3380; GFX10-NEXT:    s_setpc_b64 s[30:31]
3381;
3382; GFX11-LABEL: test_byval:
3383; GFX11:       ; %bb.0:
3384; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3385; GFX11-NEXT:    scratch_store_b16 off, v0, s32
3386; GFX11-NEXT:    s_setpc_b64 s[30:31]
3387  store bfloat %val, ptr addrspace(5) %bv
3388  %retval = load bfloat, ptr addrspace(5) %bv
3389  ret bfloat %retval
3390}
3391
; Stores %val through the sret(bfloat) pointer %sret in addrspace(5).
; The checks show the pointer in v0 and the value in v1. Runs up to GFX10 use
; buffer_store_short with `offen` addressing through s[0:3], while the GFX11
; run uses scratch_store_b16. The GCN and GFX7 check blocks first multiply v1
; by 1.0 and shift it right by 16.
3392define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
3393; GCN-LABEL: test_sret:
3394; GCN:       ; %bb.0:
3395; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3396; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3397; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3398; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
3399; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3400; GCN-NEXT:    s_setpc_b64 s[30:31]
3401;
3402; GFX7-LABEL: test_sret:
3403; GFX7:       ; %bb.0:
3404; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3405; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3406; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3407; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
3408; GFX7-NEXT:    s_waitcnt vmcnt(0)
3409; GFX7-NEXT:    s_setpc_b64 s[30:31]
3410;
3411; GFX8-LABEL: test_sret:
3412; GFX8:       ; %bb.0:
3413; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3414; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
3415; GFX8-NEXT:    s_waitcnt vmcnt(0)
3416; GFX8-NEXT:    s_setpc_b64 s[30:31]
3417;
3418; GFX9-LABEL: test_sret:
3419; GFX9:       ; %bb.0:
3420; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3421; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
3422; GFX9-NEXT:    s_waitcnt vmcnt(0)
3423; GFX9-NEXT:    s_setpc_b64 s[30:31]
3424;
3425; GFX10-LABEL: test_sret:
3426; GFX10:       ; %bb.0:
3427; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3428; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
3429; GFX10-NEXT:    s_setpc_b64 s[30:31]
3430;
3431; GFX11-LABEL: test_sret:
3432; GFX11:       ; %bb.0:
3433; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3434; GFX11-NEXT:    scratch_store_b16 v0, v1, off
3435; GFX11-NEXT:    s_setpc_b64 s[30:31]
3436  store bfloat %val, ptr addrspace(5) %sret
3437  ret void
3438}
3439
; Loads a bfloat from %in, bitcasts it to i16, and stores the i16 to %out.
; Every checked run is just a 16-bit load followed by a 16-bit store of the
; same register, so the bitcast produces no instructions of its own.
3440define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
3441; GCN-LABEL: test_bitcast_from_bfloat:
3442; GCN:       ; %bb.0:
3443; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3444; GCN-NEXT:    s_mov_b32 s6, 0
3445; GCN-NEXT:    s_mov_b32 s7, 0xf000
3446; GCN-NEXT:    s_mov_b32 s4, s6
3447; GCN-NEXT:    s_mov_b32 s5, s6
3448; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
3449; GCN-NEXT:    s_waitcnt vmcnt(0)
3450; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
3451; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3452; GCN-NEXT:    s_setpc_b64 s[30:31]
3453;
3454; GFX7-LABEL: test_bitcast_from_bfloat:
3455; GFX7:       ; %bb.0:
3456; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3457; GFX7-NEXT:    s_mov_b32 s6, 0
3458; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3459; GFX7-NEXT:    s_mov_b32 s4, s6
3460; GFX7-NEXT:    s_mov_b32 s5, s6
3461; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
3462; GFX7-NEXT:    s_waitcnt vmcnt(0)
3463; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
3464; GFX7-NEXT:    s_waitcnt vmcnt(0)
3465; GFX7-NEXT:    s_setpc_b64 s[30:31]
3466;
3467; GFX8-LABEL: test_bitcast_from_bfloat:
3468; GFX8:       ; %bb.0:
3469; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3470; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
3471; GFX8-NEXT:    s_waitcnt vmcnt(0)
3472; GFX8-NEXT:    flat_store_short v[2:3], v0
3473; GFX8-NEXT:    s_waitcnt vmcnt(0)
3474; GFX8-NEXT:    s_setpc_b64 s[30:31]
3475;
3476; GFX9-LABEL: test_bitcast_from_bfloat:
3477; GFX9:       ; %bb.0:
3478; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3479; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
3480; GFX9-NEXT:    s_waitcnt vmcnt(0)
3481; GFX9-NEXT:    global_store_short v[2:3], v0, off
3482; GFX9-NEXT:    s_waitcnt vmcnt(0)
3483; GFX9-NEXT:    s_setpc_b64 s[30:31]
3484;
3485; GFX10-LABEL: test_bitcast_from_bfloat:
3486; GFX10:       ; %bb.0:
3487; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3488; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
3489; GFX10-NEXT:    s_waitcnt vmcnt(0)
3490; GFX10-NEXT:    global_store_short v[2:3], v0, off
3491; GFX10-NEXT:    s_setpc_b64 s[30:31]
3492;
3493; GFX11-LABEL: test_bitcast_from_bfloat:
3494; GFX11:       ; %bb.0:
3495; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3496; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
3497; GFX11-NEXT:    s_waitcnt vmcnt(0)
3498; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
3499; GFX11-NEXT:    s_setpc_b64 s[30:31]
3500  %val = load bfloat, ptr addrspace(1) %in
3501  %val_int = bitcast bfloat %val to i16
3502  store i16 %val_int, ptr addrspace(1) %out
3503  ret void
3504}
3505
; Loads an i16 from %in, bitcasts it to bfloat, and stores the bfloat to %out.
; The parameter order here is (%out, %in), so the checks load through v[2:3]
; and store through v[0:1]. Every checked run is just a 16-bit load followed
; by a 16-bit store, so the bitcast produces no instructions of its own.
3506define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
3507; GCN-LABEL: test_bitcast_to_bfloat:
3508; GCN:       ; %bb.0:
3509; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3510; GCN-NEXT:    s_mov_b32 s6, 0
3511; GCN-NEXT:    s_mov_b32 s7, 0xf000
3512; GCN-NEXT:    s_mov_b32 s4, s6
3513; GCN-NEXT:    s_mov_b32 s5, s6
3514; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
3515; GCN-NEXT:    s_waitcnt vmcnt(0)
3516; GCN-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
3517; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3518; GCN-NEXT:    s_setpc_b64 s[30:31]
3519;
3520; GFX7-LABEL: test_bitcast_to_bfloat:
3521; GFX7:       ; %bb.0:
3522; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3523; GFX7-NEXT:    s_mov_b32 s6, 0
3524; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3525; GFX7-NEXT:    s_mov_b32 s4, s6
3526; GFX7-NEXT:    s_mov_b32 s5, s6
3527; GFX7-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
3528; GFX7-NEXT:    s_waitcnt vmcnt(0)
3529; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
3530; GFX7-NEXT:    s_waitcnt vmcnt(0)
3531; GFX7-NEXT:    s_setpc_b64 s[30:31]
3532;
3533; GFX8-LABEL: test_bitcast_to_bfloat:
3534; GFX8:       ; %bb.0:
3535; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3536; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
3537; GFX8-NEXT:    s_waitcnt vmcnt(0)
3538; GFX8-NEXT:    flat_store_short v[0:1], v2
3539; GFX8-NEXT:    s_waitcnt vmcnt(0)
3540; GFX8-NEXT:    s_setpc_b64 s[30:31]
3541;
3542; GFX9-LABEL: test_bitcast_to_bfloat:
3543; GFX9:       ; %bb.0:
3544; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3545; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
3546; GFX9-NEXT:    s_waitcnt vmcnt(0)
3547; GFX9-NEXT:    global_store_short v[0:1], v2, off
3548; GFX9-NEXT:    s_waitcnt vmcnt(0)
3549; GFX9-NEXT:    s_setpc_b64 s[30:31]
3550;
3551; GFX10-LABEL: test_bitcast_to_bfloat:
3552; GFX10:       ; %bb.0:
3553; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3554; GFX10-NEXT:    global_load_ushort v2, v[2:3], off
3555; GFX10-NEXT:    s_waitcnt vmcnt(0)
3556; GFX10-NEXT:    global_store_short v[0:1], v2, off
3557; GFX10-NEXT:    s_setpc_b64 s[30:31]
3558;
3559; GFX11-LABEL: test_bitcast_to_bfloat:
3560; GFX11:       ; %bb.0:
3561; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3562; GFX11-NEXT:    global_load_u16 v2, v[2:3], off
3563; GFX11-NEXT:    s_waitcnt vmcnt(0)
3564; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
3565; GFX11-NEXT:    s_setpc_b64 s[30:31]
3566  %val = load i16, ptr addrspace(1) %in
3567  %val_fp = bitcast i16 %val to bfloat
3568  store bfloat %val_fp, ptr addrspace(1) %out
3569  ret void
3570}
3571
; Returns the scalar bfloat argument unchanged. For every check prefix the
; expected body is only the entry s_waitcnt and the s_setpc_b64 return, so
; no instruction is expected to move or convert the value.
3572define bfloat @test_ret(bfloat %in) {
3573; GCN-LABEL: test_ret:
3574; GCN:       ; %bb.0: ; %entry
3575; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3576; GCN-NEXT:    s_setpc_b64 s[30:31]
3577;
3578; GFX7-LABEL: test_ret:
3579; GFX7:       ; %bb.0: ; %entry
3580; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3581; GFX7-NEXT:    s_setpc_b64 s[30:31]
3582;
3583; GFX8-LABEL: test_ret:
3584; GFX8:       ; %bb.0: ; %entry
3585; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3586; GFX8-NEXT:    s_setpc_b64 s[30:31]
3587;
3588; GFX9-LABEL: test_ret:
3589; GFX9:       ; %bb.0: ; %entry
3590; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3591; GFX9-NEXT:    s_setpc_b64 s[30:31]
3592;
3593; GFX10-LABEL: test_ret:
3594; GFX10:       ; %bb.0: ; %entry
3595; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3596; GFX10-NEXT:    s_setpc_b64 s[30:31]
3597;
3598; GFX11-LABEL: test_ret:
3599; GFX11:       ; %bb.0: ; %entry
3600; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3601; GFX11-NEXT:    s_setpc_b64 s[30:31]
3602entry:
3603  ret bfloat %in
3604}
3605
; Returns the <2 x bfloat> argument unchanged. For every check prefix the
; expected body is only the entry s_waitcnt and the s_setpc_b64 return, so
; no instruction is expected to move or repack the vector.
3606define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
3607; GCN-LABEL: test_ret_v2bf16:
3608; GCN:       ; %bb.0: ; %entry
3609; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3610; GCN-NEXT:    s_setpc_b64 s[30:31]
3611;
3612; GFX7-LABEL: test_ret_v2bf16:
3613; GFX7:       ; %bb.0: ; %entry
3614; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3615; GFX7-NEXT:    s_setpc_b64 s[30:31]
3616;
3617; GFX8-LABEL: test_ret_v2bf16:
3618; GFX8:       ; %bb.0: ; %entry
3619; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3620; GFX8-NEXT:    s_setpc_b64 s[30:31]
3621;
3622; GFX9-LABEL: test_ret_v2bf16:
3623; GFX9:       ; %bb.0: ; %entry
3624; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3625; GFX9-NEXT:    s_setpc_b64 s[30:31]
3626;
3627; GFX10-LABEL: test_ret_v2bf16:
3628; GFX10:       ; %bb.0: ; %entry
3629; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3630; GFX10-NEXT:    s_setpc_b64 s[30:31]
3631;
3632; GFX11-LABEL: test_ret_v2bf16:
3633; GFX11:       ; %bb.0: ; %entry
3634; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3635; GFX11-NEXT:    s_setpc_b64 s[30:31]
3636entry:
3637  ret <2 x bfloat> %in
3638}
3639
; Returns the <3 x bfloat> argument unchanged. For every check prefix the
; expected body is only the entry s_waitcnt and the s_setpc_b64 return, so
; no instruction is expected to move or repack the odd-sized vector.
3640define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
3641; GCN-LABEL: test_ret_v3bf16:
3642; GCN:       ; %bb.0: ; %entry
3643; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3644; GCN-NEXT:    s_setpc_b64 s[30:31]
3645;
3646; GFX7-LABEL: test_ret_v3bf16:
3647; GFX7:       ; %bb.0: ; %entry
3648; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3649; GFX7-NEXT:    s_setpc_b64 s[30:31]
3650;
3651; GFX8-LABEL: test_ret_v3bf16:
3652; GFX8:       ; %bb.0: ; %entry
3653; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3654; GFX8-NEXT:    s_setpc_b64 s[30:31]
3655;
3656; GFX9-LABEL: test_ret_v3bf16:
3657; GFX9:       ; %bb.0: ; %entry
3658; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3659; GFX9-NEXT:    s_setpc_b64 s[30:31]
3660;
3661; GFX10-LABEL: test_ret_v3bf16:
3662; GFX10:       ; %bb.0: ; %entry
3663; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3664; GFX10-NEXT:    s_setpc_b64 s[30:31]
3665;
3666; GFX11-LABEL: test_ret_v3bf16:
3667; GFX11:       ; %bb.0: ; %entry
3668; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3669; GFX11-NEXT:    s_setpc_b64 s[30:31]
3670entry:
3671  ret <3 x bfloat> %in
3672}
3673
; Returns the <4 x bfloat> argument unchanged. For every check prefix the
; expected body is only the entry s_waitcnt and the s_setpc_b64 return, so
; no instruction is expected to move or repack the vector.
3674define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
3675; GCN-LABEL: test_ret_v4bf16:
3676; GCN:       ; %bb.0: ; %entry
3677; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3678; GCN-NEXT:    s_setpc_b64 s[30:31]
3679;
3680; GFX7-LABEL: test_ret_v4bf16:
3681; GFX7:       ; %bb.0: ; %entry
3682; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3683; GFX7-NEXT:    s_setpc_b64 s[30:31]
3684;
3685; GFX8-LABEL: test_ret_v4bf16:
3686; GFX8:       ; %bb.0: ; %entry
3687; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3688; GFX8-NEXT:    s_setpc_b64 s[30:31]
3689;
3690; GFX9-LABEL: test_ret_v4bf16:
3691; GFX9:       ; %bb.0: ; %entry
3692; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3693; GFX9-NEXT:    s_setpc_b64 s[30:31]
3694;
3695; GFX10-LABEL: test_ret_v4bf16:
3696; GFX10:       ; %bb.0: ; %entry
3697; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3698; GFX10-NEXT:    s_setpc_b64 s[30:31]
3699;
3700; GFX11-LABEL: test_ret_v4bf16:
3701; GFX11:       ; %bb.0: ; %entry
3702; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3703; GFX11-NEXT:    s_setpc_b64 s[30:31]
3704entry:
3705  ret <4 x bfloat> %in
3706}
3707
; Returns the <8 x bfloat> argument unchanged. For every check prefix the
; expected body is only the entry s_waitcnt and the s_setpc_b64 return, so
; no instruction is expected to move or repack the vector.
3708define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
3709; GCN-LABEL: test_ret_v8bf16:
3710; GCN:       ; %bb.0: ; %entry
3711; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3712; GCN-NEXT:    s_setpc_b64 s[30:31]
3713;
3714; GFX7-LABEL: test_ret_v8bf16:
3715; GFX7:       ; %bb.0: ; %entry
3716; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3717; GFX7-NEXT:    s_setpc_b64 s[30:31]
3718;
3719; GFX8-LABEL: test_ret_v8bf16:
3720; GFX8:       ; %bb.0: ; %entry
3721; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3722; GFX8-NEXT:    s_setpc_b64 s[30:31]
3723;
3724; GFX9-LABEL: test_ret_v8bf16:
3725; GFX9:       ; %bb.0: ; %entry
3726; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3727; GFX9-NEXT:    s_setpc_b64 s[30:31]
3728;
3729; GFX10-LABEL: test_ret_v8bf16:
3730; GFX10:       ; %bb.0: ; %entry
3731; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3732; GFX10-NEXT:    s_setpc_b64 s[30:31]
3733;
3734; GFX11-LABEL: test_ret_v8bf16:
3735; GFX11:       ; %bb.0: ; %entry
3736; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3737; GFX11-NEXT:    s_setpc_b64 s[30:31]
3738entry:
3739  ret <8 x bfloat> %in
3740}
3741
; Returns the <16 x bfloat> argument unchanged. For every check prefix the
; expected body is only the entry s_waitcnt and the s_setpc_b64 return, so
; no instruction is expected to move or repack the vector.
3742define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
3743; GCN-LABEL: test_ret_v16bf16:
3744; GCN:       ; %bb.0: ; %entry
3745; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3746; GCN-NEXT:    s_setpc_b64 s[30:31]
3747;
3748; GFX7-LABEL: test_ret_v16bf16:
3749; GFX7:       ; %bb.0: ; %entry
3750; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3751; GFX7-NEXT:    s_setpc_b64 s[30:31]
3752;
3753; GFX8-LABEL: test_ret_v16bf16:
3754; GFX8:       ; %bb.0: ; %entry
3755; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3756; GFX8-NEXT:    s_setpc_b64 s[30:31]
3757;
3758; GFX9-LABEL: test_ret_v16bf16:
3759; GFX9:       ; %bb.0: ; %entry
3760; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3761; GFX9-NEXT:    s_setpc_b64 s[30:31]
3762;
3763; GFX10-LABEL: test_ret_v16bf16:
3764; GFX10:       ; %bb.0: ; %entry
3765; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3766; GFX10-NEXT:    s_setpc_b64 s[30:31]
3767;
3768; GFX11-LABEL: test_ret_v16bf16:
3769; GFX11:       ; %bb.0: ; %entry
3770; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3771; GFX11-NEXT:    s_setpc_b64 s[30:31]
3772entry:
3773  ret <16 x bfloat> %in
3774}
3775
; Calls @test_arg_store with %in and volatile-stores the returned bfloat to
; the addrspace(5) pointer %out. In the expected output the callee address is
; loaded through a gotpcrel32 relocation and called with s_swappc_b64, while
; s30 and s31 are kept in lanes of a VGPR that is itself spilled and reloaded
; around the body.
; Under the GCN and GFX7 prefixes the returned value goes through a
; v_mul_f32 by 1.0 and a right shift by 16 before the 16-bit store; the other
; prefixes store v0 directly.
; NOTE(review) presumably the two older configurations return bfloat widened
; to 32 bits - confirm against the calling-convention lowering.
3776define void @test_call(bfloat %in, ptr addrspace(5) %out) {
3777; GCN-LABEL: test_call:
3778; GCN:       ; %bb.0: ; %entry
3779; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3780; GCN-NEXT:    s_mov_b32 s18, s33
3781; GCN-NEXT:    s_mov_b32 s33, s32
3782; GCN-NEXT:    s_xor_saveexec_b64 s[16:17], -1
3783; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3784; GCN-NEXT:    s_mov_b64 exec, s[16:17]
3785; GCN-NEXT:    s_addk_i32 s32, 0x400
3786; GCN-NEXT:    s_waitcnt expcnt(0)
3787; GCN-NEXT:    v_writelane_b32 v2, s30, 0
3788; GCN-NEXT:    v_writelane_b32 v2, s31, 1
3789; GCN-NEXT:    s_getpc_b64 s[16:17]
3790; GCN-NEXT:    s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3791; GCN-NEXT:    s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3792; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
3793; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3794; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
3795; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3796; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3797; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
3798; GCN-NEXT:    s_waitcnt vmcnt(0)
3799; GCN-NEXT:    v_readlane_b32 s31, v2, 1
3800; GCN-NEXT:    v_readlane_b32 s30, v2, 0
3801; GCN-NEXT:    s_mov_b32 s32, s33
3802; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
3803; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3804; GCN-NEXT:    s_mov_b64 exec, s[4:5]
3805; GCN-NEXT:    s_mov_b32 s33, s18
3806; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3807; GCN-NEXT:    s_setpc_b64 s[30:31]
3808;
3809; GFX7-LABEL: test_call:
3810; GFX7:       ; %bb.0: ; %entry
3811; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3812; GFX7-NEXT:    s_mov_b32 s18, s33
3813; GFX7-NEXT:    s_mov_b32 s33, s32
3814; GFX7-NEXT:    s_xor_saveexec_b64 s[16:17], -1
3815; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3816; GFX7-NEXT:    s_mov_b64 exec, s[16:17]
3817; GFX7-NEXT:    s_addk_i32 s32, 0x400
3818; GFX7-NEXT:    s_getpc_b64 s[16:17]
3819; GFX7-NEXT:    s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3820; GFX7-NEXT:    s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3821; GFX7-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
3822; GFX7-NEXT:    v_writelane_b32 v2, s30, 0
3823; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
3824; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3825; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
3826; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3827; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3828; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
3829; GFX7-NEXT:    s_waitcnt vmcnt(0)
3830; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
3831; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
3832; GFX7-NEXT:    s_mov_b32 s32, s33
3833; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
3834; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3835; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
3836; GFX7-NEXT:    s_mov_b32 s33, s18
3837; GFX7-NEXT:    s_waitcnt vmcnt(0)
3838; GFX7-NEXT:    s_setpc_b64 s[30:31]
3839;
3840; GFX8-LABEL: test_call:
3841; GFX8:       ; %bb.0: ; %entry
3842; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3843; GFX8-NEXT:    s_mov_b32 s18, s33
3844; GFX8-NEXT:    s_mov_b32 s33, s32
3845; GFX8-NEXT:    s_xor_saveexec_b64 s[16:17], -1
3846; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3847; GFX8-NEXT:    s_mov_b64 exec, s[16:17]
3848; GFX8-NEXT:    s_addk_i32 s32, 0x400
3849; GFX8-NEXT:    s_getpc_b64 s[16:17]
3850; GFX8-NEXT:    s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3851; GFX8-NEXT:    s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3852; GFX8-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
3853; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
3854; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
3855; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3856; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
3857; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
3858; GFX8-NEXT:    s_waitcnt vmcnt(0)
3859; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
3860; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
3861; GFX8-NEXT:    s_mov_b32 s32, s33
3862; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
3863; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3864; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3865; GFX8-NEXT:    s_mov_b32 s33, s18
3866; GFX8-NEXT:    s_waitcnt vmcnt(0)
3867; GFX8-NEXT:    s_setpc_b64 s[30:31]
3868;
3869; GFX9-LABEL: test_call:
3870; GFX9:       ; %bb.0: ; %entry
3871; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3872; GFX9-NEXT:    s_mov_b32 s18, s33
3873; GFX9-NEXT:    s_mov_b32 s33, s32
3874; GFX9-NEXT:    s_xor_saveexec_b64 s[16:17], -1
3875; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3876; GFX9-NEXT:    s_mov_b64 exec, s[16:17]
3877; GFX9-NEXT:    s_addk_i32 s32, 0x400
3878; GFX9-NEXT:    s_getpc_b64 s[16:17]
3879; GFX9-NEXT:    s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3880; GFX9-NEXT:    s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3881; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
3882; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
3883; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
3884; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3885; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
3886; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
3887; GFX9-NEXT:    s_waitcnt vmcnt(0)
3888; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
3889; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
3890; GFX9-NEXT:    s_mov_b32 s32, s33
3891; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
3892; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3893; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3894; GFX9-NEXT:    s_mov_b32 s33, s18
3895; GFX9-NEXT:    s_waitcnt vmcnt(0)
3896; GFX9-NEXT:    s_setpc_b64 s[30:31]
3897;
3898; GFX10-LABEL: test_call:
3899; GFX10:       ; %bb.0: ; %entry
3900; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3901; GFX10-NEXT:    s_mov_b32 s18, s33
3902; GFX10-NEXT:    s_mov_b32 s33, s32
3903; GFX10-NEXT:    s_xor_saveexec_b32 s16, -1
3904; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3905; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3906; GFX10-NEXT:    s_mov_b32 exec_lo, s16
3907; GFX10-NEXT:    s_addk_i32 s32, 0x200
3908; GFX10-NEXT:    s_getpc_b64 s[16:17]
3909; GFX10-NEXT:    s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3910; GFX10-NEXT:    s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3911; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
3912; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
3913; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
3914; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3915; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
3916; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
3917; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3918; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
3919; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
3920; GFX10-NEXT:    s_mov_b32 s32, s33
3921; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
3922; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3923; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3924; GFX10-NEXT:    s_mov_b32 exec_lo, s4
3925; GFX10-NEXT:    s_mov_b32 s33, s18
3926; GFX10-NEXT:    s_waitcnt vmcnt(0)
3927; GFX10-NEXT:    s_setpc_b64 s[30:31]
3928;
3929; GFX11-LABEL: test_call:
3930; GFX11:       ; %bb.0: ; %entry
3931; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3932; GFX11-NEXT:    s_mov_b32 s2, s33
3933; GFX11-NEXT:    s_mov_b32 s33, s32
3934; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
3935; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
3936; GFX11-NEXT:    s_mov_b32 exec_lo, s0
3937; GFX11-NEXT:    s_add_i32 s32, s32, 16
3938; GFX11-NEXT:    s_getpc_b64 s[0:1]
3939; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4
3940; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
3941; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
3942; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
3943; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
3944; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3945; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
3946; GFX11-NEXT:    scratch_store_b16 v1, v0, off dlc
3947; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3948; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
3949; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
3950; GFX11-NEXT:    s_mov_b32 s32, s33
3951; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
3952; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
3953; GFX11-NEXT:    s_mov_b32 exec_lo, s0
3954; GFX11-NEXT:    s_mov_b32 s33, s2
3955; GFX11-NEXT:    s_waitcnt vmcnt(0)
3956; GFX11-NEXT:    s_setpc_b64 s[30:31]
3957entry:
3958  %result = call bfloat @test_arg_store(bfloat %in)
3959  store volatile bfloat %result, ptr addrspace(5) %out
3960  ret void
3961}
3962
; <2 x bfloat> variant of the call test. Calls @test_arg_store_v2bf16 with
; %in and volatile-stores the returned vector to the addrspace(5) pointer
; %out. The call sequence (gotpcrel32 address load, s_swappc_b64, s30/s31
; kept in lanes of a spilled VGPR) has the same shape for every prefix.
; Under the GCN and GFX7 prefixes each returned element goes through a
; v_mul_f32 by 1.0 and a right shift by 16, and is then written with its own
; buffer_store_short (at %out plus 2 and at %out); the other prefixes write
; the whole vector with a single 32-bit store.
3963define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
3964; GCN-LABEL: test_call_v2bf16:
3965; GCN:       ; %bb.0: ; %entry
3966; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3967; GCN-NEXT:    s_mov_b32 s18, s33
3968; GCN-NEXT:    s_mov_b32 s33, s32
3969; GCN-NEXT:    s_xor_saveexec_b64 s[16:17], -1
3970; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
3971; GCN-NEXT:    s_mov_b64 exec, s[16:17]
3972; GCN-NEXT:    s_addk_i32 s32, 0x400
3973; GCN-NEXT:    s_waitcnt expcnt(0)
3974; GCN-NEXT:    v_writelane_b32 v4, s30, 0
3975; GCN-NEXT:    v_writelane_b32 v4, s31, 1
3976; GCN-NEXT:    s_getpc_b64 s[16:17]
3977; GCN-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
3978; GCN-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
3979; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
3980; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3981; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
3982; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
3983; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
3984; GCN-NEXT:    v_add_i32_e32 v3, vcc, 2, v2
3985; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3986; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3987; GCN-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
3988; GCN-NEXT:    s_waitcnt vmcnt(0)
3989; GCN-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
3990; GCN-NEXT:    s_waitcnt vmcnt(0)
3991; GCN-NEXT:    v_readlane_b32 s31, v4, 1
3992; GCN-NEXT:    v_readlane_b32 s30, v4, 0
3993; GCN-NEXT:    s_mov_b32 s32, s33
3994; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
3995; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
3996; GCN-NEXT:    s_mov_b64 exec, s[4:5]
3997; GCN-NEXT:    s_mov_b32 s33, s18
3998; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3999; GCN-NEXT:    s_setpc_b64 s[30:31]
4000;
4001; GFX7-LABEL: test_call_v2bf16:
4002; GFX7:       ; %bb.0: ; %entry
4003; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4004; GFX7-NEXT:    s_mov_b32 s18, s33
4005; GFX7-NEXT:    s_mov_b32 s33, s32
4006; GFX7-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4007; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4008; GFX7-NEXT:    s_mov_b64 exec, s[16:17]
4009; GFX7-NEXT:    s_addk_i32 s32, 0x400
4010; GFX7-NEXT:    s_getpc_b64 s[16:17]
4011; GFX7-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4012; GFX7-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4013; GFX7-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4014; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
4015; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
4016; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4017; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4018; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4019; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4020; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4021; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 2, v2
4022; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4023; GFX7-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
4024; GFX7-NEXT:    s_waitcnt vmcnt(0)
4025; GFX7-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
4026; GFX7-NEXT:    s_waitcnt vmcnt(0)
4027; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
4028; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
4029; GFX7-NEXT:    s_mov_b32 s32, s33
4030; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4031; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4032; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
4033; GFX7-NEXT:    s_mov_b32 s33, s18
4034; GFX7-NEXT:    s_waitcnt vmcnt(0)
4035; GFX7-NEXT:    s_setpc_b64 s[30:31]
4036;
4037; GFX8-LABEL: test_call_v2bf16:
4038; GFX8:       ; %bb.0: ; %entry
4039; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4040; GFX8-NEXT:    s_mov_b32 s18, s33
4041; GFX8-NEXT:    s_mov_b32 s33, s32
4042; GFX8-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4043; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4044; GFX8-NEXT:    s_mov_b64 exec, s[16:17]
4045; GFX8-NEXT:    s_addk_i32 s32, 0x400
4046; GFX8-NEXT:    s_getpc_b64 s[16:17]
4047; GFX8-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4048; GFX8-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4049; GFX8-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4050; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
4051; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
4052; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4053; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4054; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
4055; GFX8-NEXT:    s_waitcnt vmcnt(0)
4056; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
4057; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
4058; GFX8-NEXT:    s_mov_b32 s32, s33
4059; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4060; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4061; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4062; GFX8-NEXT:    s_mov_b32 s33, s18
4063; GFX8-NEXT:    s_waitcnt vmcnt(0)
4064; GFX8-NEXT:    s_setpc_b64 s[30:31]
4065;
4066; GFX9-LABEL: test_call_v2bf16:
4067; GFX9:       ; %bb.0: ; %entry
4068; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4069; GFX9-NEXT:    s_mov_b32 s18, s33
4070; GFX9-NEXT:    s_mov_b32 s33, s32
4071; GFX9-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4072; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4073; GFX9-NEXT:    s_mov_b64 exec, s[16:17]
4074; GFX9-NEXT:    s_addk_i32 s32, 0x400
4075; GFX9-NEXT:    s_getpc_b64 s[16:17]
4076; GFX9-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4077; GFX9-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4078; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4079; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
4080; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
4081; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4082; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4083; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
4084; GFX9-NEXT:    s_waitcnt vmcnt(0)
4085; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
4086; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
4087; GFX9-NEXT:    s_mov_b32 s32, s33
4088; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4089; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4090; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4091; GFX9-NEXT:    s_mov_b32 s33, s18
4092; GFX9-NEXT:    s_waitcnt vmcnt(0)
4093; GFX9-NEXT:    s_setpc_b64 s[30:31]
4094;
4095; GFX10-LABEL: test_call_v2bf16:
4096; GFX10:       ; %bb.0: ; %entry
4097; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4098; GFX10-NEXT:    s_mov_b32 s18, s33
4099; GFX10-NEXT:    s_mov_b32 s33, s32
4100; GFX10-NEXT:    s_xor_saveexec_b32 s16, -1
4101; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4102; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4103; GFX10-NEXT:    s_mov_b32 exec_lo, s16
4104; GFX10-NEXT:    s_addk_i32 s32, 0x200
4105; GFX10-NEXT:    s_getpc_b64 s[16:17]
4106; GFX10-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4107; GFX10-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4108; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
4109; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4110; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
4111; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4112; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4113; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
4114; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4115; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
4116; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
4117; GFX10-NEXT:    s_mov_b32 s32, s33
4118; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
4119; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4120; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4121; GFX10-NEXT:    s_mov_b32 exec_lo, s4
4122; GFX10-NEXT:    s_mov_b32 s33, s18
4123; GFX10-NEXT:    s_waitcnt vmcnt(0)
4124; GFX10-NEXT:    s_setpc_b64 s[30:31]
4125;
4126; GFX11-LABEL: test_call_v2bf16:
4127; GFX11:       ; %bb.0: ; %entry
4128; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4129; GFX11-NEXT:    s_mov_b32 s2, s33
4130; GFX11-NEXT:    s_mov_b32 s33, s32
4131; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4132; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
4133; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4134; GFX11-NEXT:    s_add_i32 s32, s32, 16
4135; GFX11-NEXT:    s_getpc_b64 s[0:1]
4136; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4137; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4138; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
4139; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
4140; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
4141; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4142; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
4143; GFX11-NEXT:    scratch_store_b32 v1, v0, off dlc
4144; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4145; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
4146; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
4147; GFX11-NEXT:    s_mov_b32 s32, s33
4148; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4149; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
4150; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4151; GFX11-NEXT:    s_mov_b32 s33, s2
4152; GFX11-NEXT:    s_waitcnt vmcnt(0)
4153; GFX11-NEXT:    s_setpc_b64 s[30:31]
4154entry:
4155  %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
4156  store volatile <2 x bfloat> %result, ptr addrspace(5) %out
4157  ret void
4158}
4159
4160define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
4161; GCN-LABEL: test_call_v3bf16:
4162; GCN:       ; %bb.0: ; %entry
4163; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4164; GCN-NEXT:    s_mov_b32 s18, s33
4165; GCN-NEXT:    s_mov_b32 s33, s32
4166; GCN-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4167; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4168; GCN-NEXT:    s_mov_b64 exec, s[16:17]
4169; GCN-NEXT:    s_addk_i32 s32, 0x400
4170; GCN-NEXT:    s_waitcnt expcnt(0)
4171; GCN-NEXT:    v_writelane_b32 v5, s30, 0
4172; GCN-NEXT:    v_writelane_b32 v5, s31, 1
4173; GCN-NEXT:    s_getpc_b64 s[16:17]
4174; GCN-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4175; GCN-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4176; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4177; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4178; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4179; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4180; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4181; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
4182; GCN-NEXT:    v_add_i32_e32 v4, vcc, 4, v3
4183; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4184; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4185; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
4186; GCN-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen
4187; GCN-NEXT:    s_waitcnt vmcnt(0)
4188; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
4189; GCN-NEXT:    s_waitcnt vmcnt(0)
4190; GCN-NEXT:    v_readlane_b32 s31, v5, 1
4191; GCN-NEXT:    v_readlane_b32 s30, v5, 0
4192; GCN-NEXT:    s_mov_b32 s32, s33
4193; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4194; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4195; GCN-NEXT:    s_mov_b64 exec, s[4:5]
4196; GCN-NEXT:    s_mov_b32 s33, s18
4197; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4198; GCN-NEXT:    s_setpc_b64 s[30:31]
4199;
4200; GFX7-LABEL: test_call_v3bf16:
4201; GFX7:       ; %bb.0: ; %entry
4202; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4203; GFX7-NEXT:    s_mov_b32 s18, s33
4204; GFX7-NEXT:    s_mov_b32 s33, s32
4205; GFX7-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4206; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4207; GFX7-NEXT:    s_mov_b64 exec, s[16:17]
4208; GFX7-NEXT:    s_addk_i32 s32, 0x400
4209; GFX7-NEXT:    s_getpc_b64 s[16:17]
4210; GFX7-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4211; GFX7-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4212; GFX7-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4213; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
4214; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
4215; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4216; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4217; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4218; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4219; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4220; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
4221; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
4222; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4223; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v3
4224; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
4225; GFX7-NEXT:    s_waitcnt vmcnt(0)
4226; GFX7-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
4227; GFX7-NEXT:    s_waitcnt vmcnt(0)
4228; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
4229; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
4230; GFX7-NEXT:    s_mov_b32 s32, s33
4231; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4232; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4233; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
4234; GFX7-NEXT:    s_mov_b32 s33, s18
4235; GFX7-NEXT:    s_waitcnt vmcnt(0)
4236; GFX7-NEXT:    s_setpc_b64 s[30:31]
4237;
4238; GFX8-LABEL: test_call_v3bf16:
4239; GFX8:       ; %bb.0: ; %entry
4240; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4241; GFX8-NEXT:    s_mov_b32 s18, s33
4242; GFX8-NEXT:    s_mov_b32 s33, s32
4243; GFX8-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4244; GFX8-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4245; GFX8-NEXT:    s_mov_b64 exec, s[16:17]
4246; GFX8-NEXT:    s_addk_i32 s32, 0x400
4247; GFX8-NEXT:    s_getpc_b64 s[16:17]
4248; GFX8-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4249; GFX8-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4250; GFX8-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4251; GFX8-NEXT:    v_writelane_b32 v4, s30, 0
4252; GFX8-NEXT:    v_writelane_b32 v4, s31, 1
4253; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4254; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4255; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
4256; GFX8-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
4257; GFX8-NEXT:    s_waitcnt vmcnt(0)
4258; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4259; GFX8-NEXT:    s_waitcnt vmcnt(0)
4260; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
4261; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
4262; GFX8-NEXT:    s_mov_b32 s32, s33
4263; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4264; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4265; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4266; GFX8-NEXT:    s_mov_b32 s33, s18
4267; GFX8-NEXT:    s_waitcnt vmcnt(0)
4268; GFX8-NEXT:    s_setpc_b64 s[30:31]
4269;
4270; GFX9-LABEL: test_call_v3bf16:
4271; GFX9:       ; %bb.0: ; %entry
4272; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4273; GFX9-NEXT:    s_mov_b32 s18, s33
4274; GFX9-NEXT:    s_mov_b32 s33, s32
4275; GFX9-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4276; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4277; GFX9-NEXT:    s_mov_b64 exec, s[16:17]
4278; GFX9-NEXT:    s_addk_i32 s32, 0x400
4279; GFX9-NEXT:    s_getpc_b64 s[16:17]
4280; GFX9-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4281; GFX9-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4282; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4283; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
4284; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
4285; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4286; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4287; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
4288; GFX9-NEXT:    s_waitcnt vmcnt(0)
4289; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4290; GFX9-NEXT:    s_waitcnt vmcnt(0)
4291; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
4292; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
4293; GFX9-NEXT:    s_mov_b32 s32, s33
4294; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4295; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4296; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4297; GFX9-NEXT:    s_mov_b32 s33, s18
4298; GFX9-NEXT:    s_waitcnt vmcnt(0)
4299; GFX9-NEXT:    s_setpc_b64 s[30:31]
4300;
4301; GFX10-LABEL: test_call_v3bf16:
4302; GFX10:       ; %bb.0: ; %entry
4303; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4304; GFX10-NEXT:    s_mov_b32 s18, s33
4305; GFX10-NEXT:    s_mov_b32 s33, s32
4306; GFX10-NEXT:    s_xor_saveexec_b32 s16, -1
4307; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4308; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4309; GFX10-NEXT:    s_mov_b32 exec_lo, s16
4310; GFX10-NEXT:    s_addk_i32 s32, 0x200
4311; GFX10-NEXT:    s_getpc_b64 s[16:17]
4312; GFX10-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4313; GFX10-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4314; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
4315; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4316; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
4317; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4318; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4319; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
4320; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4321; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4322; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4323; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
4324; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
4325; GFX10-NEXT:    s_mov_b32 s32, s33
4326; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
4327; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4328; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4329; GFX10-NEXT:    s_mov_b32 exec_lo, s4
4330; GFX10-NEXT:    s_mov_b32 s33, s18
4331; GFX10-NEXT:    s_waitcnt vmcnt(0)
4332; GFX10-NEXT:    s_setpc_b64 s[30:31]
4333;
4334; GFX11-LABEL: test_call_v3bf16:
4335; GFX11:       ; %bb.0: ; %entry
4336; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4337; GFX11-NEXT:    s_mov_b32 s2, s33
4338; GFX11-NEXT:    s_mov_b32 s33, s32
4339; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4340; GFX11-NEXT:    scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
4341; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4342; GFX11-NEXT:    s_add_i32 s32, s32, 16
4343; GFX11-NEXT:    s_getpc_b64 s[0:1]
4344; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4345; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4346; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
4347; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
4348; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
4349; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4350; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
4351; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
4352; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4353; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
4354; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4355; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
4356; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
4357; GFX11-NEXT:    s_mov_b32 s32, s33
4358; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4359; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
4360; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4361; GFX11-NEXT:    s_mov_b32 s33, s2
4362; GFX11-NEXT:    s_waitcnt vmcnt(0)
4363; GFX11-NEXT:    s_setpc_b64 s[30:31]
4364entry:
4365  %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
4366  store volatile <3 x bfloat> %result, ptr addrspace(5) %out
4367  ret void
4368}
4369
; test_call_v4bf16: passes <4 x bfloat> %in to @test_arg_store_v2bf16 and
; volatile-stores the <4 x bfloat> result to the addrspace(5) pointer %out.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; What the current expectations show for the store of the call result:
;   - GCN and GFX7 prefixes: four buffer_store_short, each value first run
;     through v_mul_f32 by 1.0 and shifted right by 16.
;   - GFX8 prefix: two buffer_store_dword, the +4 address formed by v_add_u32.
;   - GFX9 and GFX10 prefixes: two buffer_store_dword, one of them using offset:4.
;   - GFX11 prefix: a single scratch_store_b64.
; In every configuration the callee address is loaded through a gotpcrel32
; relocation and the call is made with s_swappc_b64.
4370define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
4371; GCN-LABEL: test_call_v4bf16:
4372; GCN:       ; %bb.0: ; %entry
4373; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4374; GCN-NEXT:    s_mov_b32 s18, s33
4375; GCN-NEXT:    s_mov_b32 s33, s32
4376; GCN-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4377; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
4378; GCN-NEXT:    s_mov_b64 exec, s[16:17]
4379; GCN-NEXT:    s_addk_i32 s32, 0x400
4380; GCN-NEXT:    s_waitcnt expcnt(0)
4381; GCN-NEXT:    v_writelane_b32 v8, s30, 0
4382; GCN-NEXT:    v_writelane_b32 v8, s31, 1
4383; GCN-NEXT:    s_getpc_b64 s[16:17]
4384; GCN-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4385; GCN-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4386; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4387; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4388; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4389; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4390; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4391; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
4392; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
4393; GCN-NEXT:    v_add_i32_e32 v5, vcc, 6, v4
4394; GCN-NEXT:    v_add_i32_e32 v6, vcc, 4, v4
4395; GCN-NEXT:    v_add_i32_e32 v7, vcc, 2, v4
4396; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4397; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4398; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4399; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
4400; GCN-NEXT:    buffer_store_short v3, v5, s[0:3], 0 offen
4401; GCN-NEXT:    s_waitcnt vmcnt(0)
4402; GCN-NEXT:    buffer_store_short v2, v6, s[0:3], 0 offen
4403; GCN-NEXT:    s_waitcnt vmcnt(0)
4404; GCN-NEXT:    buffer_store_short v1, v7, s[0:3], 0 offen
4405; GCN-NEXT:    s_waitcnt vmcnt(0)
4406; GCN-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
4407; GCN-NEXT:    s_waitcnt vmcnt(0)
4408; GCN-NEXT:    v_readlane_b32 s31, v8, 1
4409; GCN-NEXT:    v_readlane_b32 s30, v8, 0
4410; GCN-NEXT:    s_mov_b32 s32, s33
4411; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4412; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
4413; GCN-NEXT:    s_mov_b64 exec, s[4:5]
4414; GCN-NEXT:    s_mov_b32 s33, s18
4415; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4416; GCN-NEXT:    s_setpc_b64 s[30:31]
4417;
4418; GFX7-LABEL: test_call_v4bf16:
4419; GFX7:       ; %bb.0: ; %entry
4420; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4421; GFX7-NEXT:    s_mov_b32 s18, s33
4422; GFX7-NEXT:    s_mov_b32 s33, s32
4423; GFX7-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4424; GFX7-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
4425; GFX7-NEXT:    s_mov_b64 exec, s[16:17]
4426; GFX7-NEXT:    s_addk_i32 s32, 0x400
4427; GFX7-NEXT:    s_getpc_b64 s[16:17]
4428; GFX7-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4429; GFX7-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4430; GFX7-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4431; GFX7-NEXT:    v_writelane_b32 v6, s30, 0
4432; GFX7-NEXT:    v_writelane_b32 v6, s31, 1
4433; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4434; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4435; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
4436; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
4437; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
4438; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 6, v4
4439; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4440; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4441; GFX7-NEXT:    buffer_store_short v3, v5, s[0:3], 0 offen
4442; GFX7-NEXT:    s_waitcnt vmcnt(0)
4443; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v4
4444; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4445; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4446; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
4447; GFX7-NEXT:    s_waitcnt vmcnt(0)
4448; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v4
4449; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4450; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
4451; GFX7-NEXT:    s_waitcnt vmcnt(0)
4452; GFX7-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
4453; GFX7-NEXT:    s_waitcnt vmcnt(0)
4454; GFX7-NEXT:    v_readlane_b32 s31, v6, 1
4455; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
4456; GFX7-NEXT:    s_mov_b32 s32, s33
4457; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4458; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
4459; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
4460; GFX7-NEXT:    s_mov_b32 s33, s18
4461; GFX7-NEXT:    s_waitcnt vmcnt(0)
4462; GFX7-NEXT:    s_setpc_b64 s[30:31]
4463;
4464; GFX8-LABEL: test_call_v4bf16:
4465; GFX8:       ; %bb.0: ; %entry
4466; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4467; GFX8-NEXT:    s_mov_b32 s18, s33
4468; GFX8-NEXT:    s_mov_b32 s33, s32
4469; GFX8-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4470; GFX8-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4471; GFX8-NEXT:    s_mov_b64 exec, s[16:17]
4472; GFX8-NEXT:    s_addk_i32 s32, 0x400
4473; GFX8-NEXT:    s_getpc_b64 s[16:17]
4474; GFX8-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4475; GFX8-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4476; GFX8-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4477; GFX8-NEXT:    v_writelane_b32 v4, s30, 0
4478; GFX8-NEXT:    v_writelane_b32 v4, s31, 1
4479; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4480; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4481; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
4482; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
4483; GFX8-NEXT:    s_waitcnt vmcnt(0)
4484; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4485; GFX8-NEXT:    s_waitcnt vmcnt(0)
4486; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
4487; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
4488; GFX8-NEXT:    s_mov_b32 s32, s33
4489; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4490; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4491; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4492; GFX8-NEXT:    s_mov_b32 s33, s18
4493; GFX8-NEXT:    s_waitcnt vmcnt(0)
4494; GFX8-NEXT:    s_setpc_b64 s[30:31]
4495;
4496; GFX9-LABEL: test_call_v4bf16:
4497; GFX9:       ; %bb.0: ; %entry
4498; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4499; GFX9-NEXT:    s_mov_b32 s18, s33
4500; GFX9-NEXT:    s_mov_b32 s33, s32
4501; GFX9-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4502; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4503; GFX9-NEXT:    s_mov_b64 exec, s[16:17]
4504; GFX9-NEXT:    s_addk_i32 s32, 0x400
4505; GFX9-NEXT:    s_getpc_b64 s[16:17]
4506; GFX9-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4507; GFX9-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4508; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4509; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
4510; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
4511; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4512; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4513; GFX9-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4514; GFX9-NEXT:    s_waitcnt vmcnt(0)
4515; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4516; GFX9-NEXT:    s_waitcnt vmcnt(0)
4517; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
4518; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
4519; GFX9-NEXT:    s_mov_b32 s32, s33
4520; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4521; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4522; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4523; GFX9-NEXT:    s_mov_b32 s33, s18
4524; GFX9-NEXT:    s_waitcnt vmcnt(0)
4525; GFX9-NEXT:    s_setpc_b64 s[30:31]
4526;
4527; GFX10-LABEL: test_call_v4bf16:
4528; GFX10:       ; %bb.0: ; %entry
4529; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4530; GFX10-NEXT:    s_mov_b32 s18, s33
4531; GFX10-NEXT:    s_mov_b32 s33, s32
4532; GFX10-NEXT:    s_xor_saveexec_b32 s16, -1
4533; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4534; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4535; GFX10-NEXT:    s_mov_b32 exec_lo, s16
4536; GFX10-NEXT:    s_addk_i32 s32, 0x200
4537; GFX10-NEXT:    s_getpc_b64 s[16:17]
4538; GFX10-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4539; GFX10-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4540; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
4541; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4542; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
4543; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4544; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4545; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4546; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4547; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4548; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4549; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
4550; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
4551; GFX10-NEXT:    s_mov_b32 s32, s33
4552; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
4553; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4554; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4555; GFX10-NEXT:    s_mov_b32 exec_lo, s4
4556; GFX10-NEXT:    s_mov_b32 s33, s18
4557; GFX10-NEXT:    s_waitcnt vmcnt(0)
4558; GFX10-NEXT:    s_setpc_b64 s[30:31]
4559;
4560; GFX11-LABEL: test_call_v4bf16:
4561; GFX11:       ; %bb.0: ; %entry
4562; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4563; GFX11-NEXT:    s_mov_b32 s2, s33
4564; GFX11-NEXT:    s_mov_b32 s33, s32
4565; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4566; GFX11-NEXT:    scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
4567; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4568; GFX11-NEXT:    s_add_i32 s32, s32, 16
4569; GFX11-NEXT:    s_getpc_b64 s[0:1]
4570; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4571; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4572; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
4573; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
4574; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
4575; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4576; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
4577; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off dlc
4578; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4579; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
4580; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
4581; GFX11-NEXT:    s_mov_b32 s32, s33
4582; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4583; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
4584; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4585; GFX11-NEXT:    s_mov_b32 s33, s2
4586; GFX11-NEXT:    s_waitcnt vmcnt(0)
4587; GFX11-NEXT:    s_setpc_b64 s[30:31]
4588entry:
  ; NOTE(review): the callee is named test_arg_store_v2bf16 but is called here
  ; with a <4 x bfloat> argument and return type; its definition is not visible
  ; in this part of the file -- confirm the signature mismatch is intentional.
4589  %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
  ; The store is volatile and targets an addrspace(5) pointer.
4590  store volatile <4 x bfloat> %result, ptr addrspace(5) %out
4591  ret void
4592}
4593
; test_call_v8bf16: passes <8 x bfloat> %in to @test_arg_store_v2bf16 and
; volatile-stores the <8 x bfloat> result to the addrspace(5) pointer %out.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; What the current expectations show for the store of the call result:
;   - GCN and GFX7 prefixes: eight buffer_store_short, each value first run
;     through v_mul_f32 by 1.0 and shifted right by 16.
;   - GFX8 prefix: four buffer_store_dword, with the +12, +8 and +4 addresses
;     formed by v_add_u32.
;   - GFX9 and GFX10 prefixes: four buffer_store_dword, three of them using
;     offset:12, offset:8 and offset:4.
;   - GFX11 prefix: a single scratch_store_b128.
; In every configuration the callee address is loaded through a gotpcrel32
; relocation and the call is made with s_swappc_b64.
4594define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
4595; GCN-LABEL: test_call_v8bf16:
4596; GCN:       ; %bb.0: ; %entry
4597; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4598; GCN-NEXT:    s_mov_b32 s18, s33
4599; GCN-NEXT:    s_mov_b32 s33, s32
4600; GCN-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4601; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill
4602; GCN-NEXT:    s_mov_b64 exec, s[16:17]
4603; GCN-NEXT:    s_addk_i32 s32, 0x400
4604; GCN-NEXT:    s_waitcnt expcnt(0)
4605; GCN-NEXT:    v_writelane_b32 v16, s30, 0
4606; GCN-NEXT:    v_writelane_b32 v16, s31, 1
4607; GCN-NEXT:    s_getpc_b64 s[16:17]
4608; GCN-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4609; GCN-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4610; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4611; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4612; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4613; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4614; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4615; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
4616; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
4617; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
4618; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
4619; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
4620; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
4621; GCN-NEXT:    v_add_i32_e32 v9, vcc, 14, v8
4622; GCN-NEXT:    v_add_i32_e32 v10, vcc, 12, v8
4623; GCN-NEXT:    v_add_i32_e32 v11, vcc, 10, v8
4624; GCN-NEXT:    v_add_i32_e32 v12, vcc, 8, v8
4625; GCN-NEXT:    v_add_i32_e32 v13, vcc, 6, v8
4626; GCN-NEXT:    v_add_i32_e32 v14, vcc, 4, v8
4627; GCN-NEXT:    v_add_i32_e32 v15, vcc, 2, v8
4628; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4629; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4630; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4631; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
4632; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
4633; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
4634; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
4635; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
4636; GCN-NEXT:    buffer_store_short v7, v9, s[0:3], 0 offen
4637; GCN-NEXT:    s_waitcnt vmcnt(0)
4638; GCN-NEXT:    buffer_store_short v6, v10, s[0:3], 0 offen
4639; GCN-NEXT:    s_waitcnt vmcnt(0)
4640; GCN-NEXT:    buffer_store_short v5, v11, s[0:3], 0 offen
4641; GCN-NEXT:    s_waitcnt vmcnt(0)
4642; GCN-NEXT:    buffer_store_short v4, v12, s[0:3], 0 offen
4643; GCN-NEXT:    s_waitcnt vmcnt(0)
4644; GCN-NEXT:    buffer_store_short v3, v13, s[0:3], 0 offen
4645; GCN-NEXT:    s_waitcnt vmcnt(0)
4646; GCN-NEXT:    buffer_store_short v2, v14, s[0:3], 0 offen
4647; GCN-NEXT:    s_waitcnt vmcnt(0)
4648; GCN-NEXT:    buffer_store_short v1, v15, s[0:3], 0 offen
4649; GCN-NEXT:    s_waitcnt vmcnt(0)
4650; GCN-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
4651; GCN-NEXT:    s_waitcnt vmcnt(0)
4652; GCN-NEXT:    v_readlane_b32 s31, v16, 1
4653; GCN-NEXT:    v_readlane_b32 s30, v16, 0
4654; GCN-NEXT:    s_mov_b32 s32, s33
4655; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4656; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
4657; GCN-NEXT:    s_mov_b64 exec, s[4:5]
4658; GCN-NEXT:    s_mov_b32 s33, s18
4659; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4660; GCN-NEXT:    s_setpc_b64 s[30:31]
4661;
4662; GFX7-LABEL: test_call_v8bf16:
4663; GFX7:       ; %bb.0: ; %entry
4664; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4665; GFX7-NEXT:    s_mov_b32 s18, s33
4666; GFX7-NEXT:    s_mov_b32 s33, s32
4667; GFX7-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4668; GFX7-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
4669; GFX7-NEXT:    s_mov_b64 exec, s[16:17]
4670; GFX7-NEXT:    s_addk_i32 s32, 0x400
4671; GFX7-NEXT:    s_getpc_b64 s[16:17]
4672; GFX7-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4673; GFX7-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4674; GFX7-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4675; GFX7-NEXT:    v_writelane_b32 v10, s30, 0
4676; GFX7-NEXT:    v_writelane_b32 v10, s31, 1
4677; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4678; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4679; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
4680; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
4681; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
4682; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 14, v8
4683; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
4684; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
4685; GFX7-NEXT:    buffer_store_short v7, v9, s[0:3], 0 offen
4686; GFX7-NEXT:    s_waitcnt vmcnt(0)
4687; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v8
4688; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
4689; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
4690; GFX7-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
4691; GFX7-NEXT:    s_waitcnt vmcnt(0)
4692; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 10, v8
4693; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
4694; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
4695; GFX7-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
4696; GFX7-NEXT:    s_waitcnt vmcnt(0)
4697; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 8, v8
4698; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
4699; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
4700; GFX7-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
4701; GFX7-NEXT:    s_waitcnt vmcnt(0)
4702; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 6, v8
4703; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4704; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4705; GFX7-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
4706; GFX7-NEXT:    s_waitcnt vmcnt(0)
4707; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v8
4708; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4709; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4710; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
4711; GFX7-NEXT:    s_waitcnt vmcnt(0)
4712; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v8
4713; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4714; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
4715; GFX7-NEXT:    s_waitcnt vmcnt(0)
4716; GFX7-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
4717; GFX7-NEXT:    s_waitcnt vmcnt(0)
4718; GFX7-NEXT:    v_readlane_b32 s31, v10, 1
4719; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
4720; GFX7-NEXT:    s_mov_b32 s32, s33
4721; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4722; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
4723; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
4724; GFX7-NEXT:    s_mov_b32 s33, s18
4725; GFX7-NEXT:    s_waitcnt vmcnt(0)
4726; GFX7-NEXT:    s_setpc_b64 s[30:31]
4727;
4728; GFX8-LABEL: test_call_v8bf16:
4729; GFX8:       ; %bb.0: ; %entry
4730; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4731; GFX8-NEXT:    s_mov_b32 s18, s33
4732; GFX8-NEXT:    s_mov_b32 s33, s32
4733; GFX8-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4734; GFX8-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
4735; GFX8-NEXT:    s_mov_b64 exec, s[16:17]
4736; GFX8-NEXT:    s_addk_i32 s32, 0x400
4737; GFX8-NEXT:    s_getpc_b64 s[16:17]
4738; GFX8-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4739; GFX8-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4740; GFX8-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4741; GFX8-NEXT:    v_writelane_b32 v6, s30, 0
4742; GFX8-NEXT:    v_writelane_b32 v6, s31, 1
4743; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4744; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4745; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 12, v4
4746; GFX8-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
4747; GFX8-NEXT:    s_waitcnt vmcnt(0)
4748; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v4
4749; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
4750; GFX8-NEXT:    s_waitcnt vmcnt(0)
4751; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
4752; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
4753; GFX8-NEXT:    s_waitcnt vmcnt(0)
4754; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4755; GFX8-NEXT:    s_waitcnt vmcnt(0)
4756; GFX8-NEXT:    v_readlane_b32 s31, v6, 1
4757; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
4758; GFX8-NEXT:    s_mov_b32 s32, s33
4759; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4760; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
4761; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4762; GFX8-NEXT:    s_mov_b32 s33, s18
4763; GFX8-NEXT:    s_waitcnt vmcnt(0)
4764; GFX8-NEXT:    s_setpc_b64 s[30:31]
4765;
4766; GFX9-LABEL: test_call_v8bf16:
4767; GFX9:       ; %bb.0: ; %entry
4768; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4769; GFX9-NEXT:    s_mov_b32 s18, s33
4770; GFX9-NEXT:    s_mov_b32 s33, s32
4771; GFX9-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4772; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4773; GFX9-NEXT:    s_mov_b64 exec, s[16:17]
4774; GFX9-NEXT:    s_addk_i32 s32, 0x400
4775; GFX9-NEXT:    s_getpc_b64 s[16:17]
4776; GFX9-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4777; GFX9-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4778; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4779; GFX9-NEXT:    v_writelane_b32 v5, s30, 0
4780; GFX9-NEXT:    v_writelane_b32 v5, s31, 1
4781; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4782; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4783; GFX9-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
4784; GFX9-NEXT:    s_waitcnt vmcnt(0)
4785; GFX9-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
4786; GFX9-NEXT:    s_waitcnt vmcnt(0)
4787; GFX9-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4788; GFX9-NEXT:    s_waitcnt vmcnt(0)
4789; GFX9-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4790; GFX9-NEXT:    s_waitcnt vmcnt(0)
4791; GFX9-NEXT:    v_readlane_b32 s31, v5, 1
4792; GFX9-NEXT:    v_readlane_b32 s30, v5, 0
4793; GFX9-NEXT:    s_mov_b32 s32, s33
4794; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4795; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4796; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4797; GFX9-NEXT:    s_mov_b32 s33, s18
4798; GFX9-NEXT:    s_waitcnt vmcnt(0)
4799; GFX9-NEXT:    s_setpc_b64 s[30:31]
4800;
4801; GFX10-LABEL: test_call_v8bf16:
4802; GFX10:       ; %bb.0: ; %entry
4803; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4804; GFX10-NEXT:    s_mov_b32 s18, s33
4805; GFX10-NEXT:    s_mov_b32 s33, s32
4806; GFX10-NEXT:    s_xor_saveexec_b32 s16, -1
4807; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4808; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4809; GFX10-NEXT:    s_mov_b32 exec_lo, s16
4810; GFX10-NEXT:    s_addk_i32 s32, 0x200
4811; GFX10-NEXT:    s_getpc_b64 s[16:17]
4812; GFX10-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4813; GFX10-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4814; GFX10-NEXT:    v_writelane_b32 v5, s30, 0
4815; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4816; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
4817; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4818; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4819; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
4820; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4821; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
4822; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4823; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4824; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4825; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4826; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4827; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
4828; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
4829; GFX10-NEXT:    s_mov_b32 s32, s33
4830; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
4831; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4832; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4833; GFX10-NEXT:    s_mov_b32 exec_lo, s4
4834; GFX10-NEXT:    s_mov_b32 s33, s18
4835; GFX10-NEXT:    s_waitcnt vmcnt(0)
4836; GFX10-NEXT:    s_setpc_b64 s[30:31]
4837;
4838; GFX11-LABEL: test_call_v8bf16:
4839; GFX11:       ; %bb.0: ; %entry
4840; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4841; GFX11-NEXT:    s_mov_b32 s2, s33
4842; GFX11-NEXT:    s_mov_b32 s33, s32
4843; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4844; GFX11-NEXT:    scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
4845; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4846; GFX11-NEXT:    s_add_i32 s32, s32, 16
4847; GFX11-NEXT:    s_getpc_b64 s[0:1]
4848; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4849; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4850; GFX11-NEXT:    v_writelane_b32 v5, s30, 0
4851; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
4852; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
4853; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4854; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
4855; GFX11-NEXT:    scratch_store_b128 v4, v[0:3], off dlc
4856; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4857; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
4858; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
4859; GFX11-NEXT:    s_mov_b32 s32, s33
4860; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
4861; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
4862; GFX11-NEXT:    s_mov_b32 exec_lo, s0
4863; GFX11-NEXT:    s_mov_b32 s33, s2
4864; GFX11-NEXT:    s_waitcnt vmcnt(0)
4865; GFX11-NEXT:    s_setpc_b64 s[30:31]
4866entry:
  ; NOTE(review): the callee is named test_arg_store_v2bf16 but is called here
  ; with an <8 x bfloat> argument and return type; its definition is not visible
  ; in this part of the file -- confirm the signature mismatch is intentional.
4867  %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
  ; The store is volatile and targets an addrspace(5) pointer.
4868  store volatile <8 x bfloat> %result, ptr addrspace(5) %out
4869  ret void
4870}
4871
4872define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
4873; GCN-LABEL: test_call_v16bf16:
4874; GCN:       ; %bb.0: ; %entry
4875; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4876; GCN-NEXT:    s_mov_b32 s18, s33
4877; GCN-NEXT:    s_mov_b32 s33, s32
4878; GCN-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4879; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill
4880; GCN-NEXT:    s_mov_b64 exec, s[16:17]
4881; GCN-NEXT:    s_addk_i32 s32, 0x400
4882; GCN-NEXT:    s_waitcnt expcnt(0)
4883; GCN-NEXT:    v_writelane_b32 v20, s30, 0
4884; GCN-NEXT:    v_writelane_b32 v20, s31, 1
4885; GCN-NEXT:    s_getpc_b64 s[16:17]
4886; GCN-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4887; GCN-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4888; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4889; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4890; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4891; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4892; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
4893; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
4894; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
4895; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
4896; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
4897; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
4898; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
4899; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
4900; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
4901; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
4902; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
4903; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
4904; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
4905; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
4906; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
4907; GCN-NEXT:    v_add_i32_e32 v17, vcc, 30, v16
4908; GCN-NEXT:    v_add_i32_e32 v18, vcc, 28, v16
4909; GCN-NEXT:    v_add_i32_e32 v19, vcc, 26, v16
4910; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
4911; GCN-NEXT:    buffer_store_short v15, v17, s[0:3], 0 offen
4912; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4913; GCN-NEXT:    v_add_i32_e32 v15, vcc, 24, v16
4914; GCN-NEXT:    v_add_i32_e32 v17, vcc, 22, v16
4915; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
4916; GCN-NEXT:    buffer_store_short v14, v18, s[0:3], 0 offen
4917; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4918; GCN-NEXT:    v_add_i32_e32 v14, vcc, 20, v16
4919; GCN-NEXT:    v_add_i32_e32 v18, vcc, 18, v16
4920; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
4921; GCN-NEXT:    buffer_store_short v13, v19, s[0:3], 0 offen
4922; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4923; GCN-NEXT:    v_add_i32_e32 v13, vcc, 16, v16
4924; GCN-NEXT:    v_add_i32_e32 v19, vcc, 14, v16
4925; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
4926; GCN-NEXT:    buffer_store_short v12, v15, s[0:3], 0 offen
4927; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4928; GCN-NEXT:    v_add_i32_e32 v12, vcc, 12, v16
4929; GCN-NEXT:    v_add_i32_e32 v15, vcc, 10, v16
4930; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
4931; GCN-NEXT:    buffer_store_short v11, v17, s[0:3], 0 offen
4932; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4933; GCN-NEXT:    v_add_i32_e32 v11, vcc, 8, v16
4934; GCN-NEXT:    v_add_i32_e32 v17, vcc, 6, v16
4935; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
4936; GCN-NEXT:    buffer_store_short v10, v14, s[0:3], 0 offen
4937; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4938; GCN-NEXT:    v_add_i32_e32 v10, vcc, 4, v16
4939; GCN-NEXT:    v_add_i32_e32 v14, vcc, 2, v16
4940; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4941; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
4942; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
4943; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
4944; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
4945; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
4946; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
4947; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
4948; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
4949; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
4950; GCN-NEXT:    buffer_store_short v9, v18, s[0:3], 0 offen
4951; GCN-NEXT:    s_waitcnt vmcnt(0)
4952; GCN-NEXT:    buffer_store_short v8, v13, s[0:3], 0 offen
4953; GCN-NEXT:    s_waitcnt vmcnt(0)
4954; GCN-NEXT:    buffer_store_short v7, v19, s[0:3], 0 offen
4955; GCN-NEXT:    s_waitcnt vmcnt(0)
4956; GCN-NEXT:    buffer_store_short v6, v12, s[0:3], 0 offen
4957; GCN-NEXT:    s_waitcnt vmcnt(0)
4958; GCN-NEXT:    buffer_store_short v5, v15, s[0:3], 0 offen
4959; GCN-NEXT:    s_waitcnt vmcnt(0)
4960; GCN-NEXT:    buffer_store_short v4, v11, s[0:3], 0 offen
4961; GCN-NEXT:    s_waitcnt vmcnt(0)
4962; GCN-NEXT:    buffer_store_short v3, v17, s[0:3], 0 offen
4963; GCN-NEXT:    s_waitcnt vmcnt(0)
4964; GCN-NEXT:    buffer_store_short v2, v10, s[0:3], 0 offen
4965; GCN-NEXT:    s_waitcnt vmcnt(0)
4966; GCN-NEXT:    buffer_store_short v1, v14, s[0:3], 0 offen
4967; GCN-NEXT:    s_waitcnt vmcnt(0)
4968; GCN-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
4969; GCN-NEXT:    s_waitcnt vmcnt(0)
4970; GCN-NEXT:    v_readlane_b32 s31, v20, 1
4971; GCN-NEXT:    v_readlane_b32 s30, v20, 0
4972; GCN-NEXT:    s_mov_b32 s32, s33
4973; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
4974; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
4975; GCN-NEXT:    s_mov_b64 exec, s[4:5]
4976; GCN-NEXT:    s_mov_b32 s33, s18
4977; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4978; GCN-NEXT:    s_setpc_b64 s[30:31]
4979;
4980; GFX7-LABEL: test_call_v16bf16:
4981; GFX7:       ; %bb.0: ; %entry
4982; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4983; GFX7-NEXT:    s_mov_b32 s18, s33
4984; GFX7-NEXT:    s_mov_b32 s33, s32
4985; GFX7-NEXT:    s_xor_saveexec_b64 s[16:17], -1
4986; GFX7-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
4987; GFX7-NEXT:    s_mov_b64 exec, s[16:17]
4988; GFX7-NEXT:    s_addk_i32 s32, 0x400
4989; GFX7-NEXT:    s_getpc_b64 s[16:17]
4990; GFX7-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4991; GFX7-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4992; GFX7-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
4993; GFX7-NEXT:    v_writelane_b32 v18, s30, 0
4994; GFX7-NEXT:    v_writelane_b32 v18, s31, 1
4995; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4996; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
4997; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
4998; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
4999; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
5000; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 30, v16
5001; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
5002; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
5003; GFX7-NEXT:    buffer_store_short v15, v17, s[0:3], 0 offen
5004; GFX7-NEXT:    s_waitcnt vmcnt(0)
5005; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 28, v16
5006; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
5007; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
5008; GFX7-NEXT:    buffer_store_short v14, v15, s[0:3], 0 offen
5009; GFX7-NEXT:    s_waitcnt vmcnt(0)
5010; GFX7-NEXT:    v_add_i32_e32 v14, vcc, 26, v16
5011; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
5012; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
5013; GFX7-NEXT:    buffer_store_short v13, v14, s[0:3], 0 offen
5014; GFX7-NEXT:    s_waitcnt vmcnt(0)
5015; GFX7-NEXT:    v_add_i32_e32 v13, vcc, 24, v16
5016; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
5017; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
5018; GFX7-NEXT:    buffer_store_short v12, v13, s[0:3], 0 offen
5019; GFX7-NEXT:    s_waitcnt vmcnt(0)
5020; GFX7-NEXT:    v_add_i32_e32 v12, vcc, 22, v16
5021; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
5022; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
5023; GFX7-NEXT:    buffer_store_short v11, v12, s[0:3], 0 offen
5024; GFX7-NEXT:    s_waitcnt vmcnt(0)
5025; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 20, v16
5026; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
5027; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
5028; GFX7-NEXT:    buffer_store_short v10, v11, s[0:3], 0 offen
5029; GFX7-NEXT:    s_waitcnt vmcnt(0)
5030; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 18, v16
5031; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
5032; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
5033; GFX7-NEXT:    buffer_store_short v9, v10, s[0:3], 0 offen
5034; GFX7-NEXT:    s_waitcnt vmcnt(0)
5035; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 16, v16
5036; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
5037; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
5038; GFX7-NEXT:    buffer_store_short v8, v9, s[0:3], 0 offen
5039; GFX7-NEXT:    s_waitcnt vmcnt(0)
5040; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 14, v16
5041; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
5042; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
5043; GFX7-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen
5044; GFX7-NEXT:    s_waitcnt vmcnt(0)
5045; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v16
5046; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
5047; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
5048; GFX7-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
5049; GFX7-NEXT:    s_waitcnt vmcnt(0)
5050; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 10, v16
5051; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
5052; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
5053; GFX7-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
5054; GFX7-NEXT:    s_waitcnt vmcnt(0)
5055; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 8, v16
5056; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
5057; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
5058; GFX7-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
5059; GFX7-NEXT:    s_waitcnt vmcnt(0)
5060; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 6, v16
5061; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
5062; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
5063; GFX7-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
5064; GFX7-NEXT:    s_waitcnt vmcnt(0)
5065; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v16
5066; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5067; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
5068; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
5069; GFX7-NEXT:    s_waitcnt vmcnt(0)
5070; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v16
5071; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5072; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
5073; GFX7-NEXT:    s_waitcnt vmcnt(0)
5074; GFX7-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
5075; GFX7-NEXT:    s_waitcnt vmcnt(0)
5076; GFX7-NEXT:    v_readlane_b32 s31, v18, 1
5077; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
5078; GFX7-NEXT:    s_mov_b32 s32, s33
5079; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
5080; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
5081; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
5082; GFX7-NEXT:    s_mov_b32 s33, s18
5083; GFX7-NEXT:    s_waitcnt vmcnt(0)
5084; GFX7-NEXT:    s_setpc_b64 s[30:31]
5085;
5086; GFX8-LABEL: test_call_v16bf16:
5087; GFX8:       ; %bb.0: ; %entry
5088; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5089; GFX8-NEXT:    s_mov_b32 s18, s33
5090; GFX8-NEXT:    s_mov_b32 s33, s32
5091; GFX8-NEXT:    s_xor_saveexec_b64 s[16:17], -1
5092; GFX8-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
5093; GFX8-NEXT:    s_mov_b64 exec, s[16:17]
5094; GFX8-NEXT:    s_addk_i32 s32, 0x400
5095; GFX8-NEXT:    s_getpc_b64 s[16:17]
5096; GFX8-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5097; GFX8-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5098; GFX8-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
5099; GFX8-NEXT:    v_writelane_b32 v10, s30, 0
5100; GFX8-NEXT:    v_writelane_b32 v10, s31, 1
5101; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5102; GFX8-NEXT:    s_swappc_b64 s[30:31], s[16:17]
5103; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 28, v8
5104; GFX8-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
5105; GFX8-NEXT:    s_waitcnt vmcnt(0)
5106; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 24, v8
5107; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
5108; GFX8-NEXT:    s_waitcnt vmcnt(0)
5109; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 20, v8
5110; GFX8-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
5111; GFX8-NEXT:    s_waitcnt vmcnt(0)
5112; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v8
5113; GFX8-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
5114; GFX8-NEXT:    s_waitcnt vmcnt(0)
5115; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 12, v8
5116; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
5117; GFX8-NEXT:    s_waitcnt vmcnt(0)
5118; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v8
5119; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
5120; GFX8-NEXT:    s_waitcnt vmcnt(0)
5121; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
5122; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
5123; GFX8-NEXT:    s_waitcnt vmcnt(0)
5124; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
5125; GFX8-NEXT:    s_waitcnt vmcnt(0)
5126; GFX8-NEXT:    v_readlane_b32 s31, v10, 1
5127; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
5128; GFX8-NEXT:    s_mov_b32 s32, s33
5129; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
5130; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
5131; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
5132; GFX8-NEXT:    s_mov_b32 s33, s18
5133; GFX8-NEXT:    s_waitcnt vmcnt(0)
5134; GFX8-NEXT:    s_setpc_b64 s[30:31]
5135;
5136; GFX9-LABEL: test_call_v16bf16:
5137; GFX9:       ; %bb.0: ; %entry
5138; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5139; GFX9-NEXT:    s_mov_b32 s18, s33
5140; GFX9-NEXT:    s_mov_b32 s33, s32
5141; GFX9-NEXT:    s_xor_saveexec_b64 s[16:17], -1
5142; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
5143; GFX9-NEXT:    s_mov_b64 exec, s[16:17]
5144; GFX9-NEXT:    s_addk_i32 s32, 0x400
5145; GFX9-NEXT:    s_getpc_b64 s[16:17]
5146; GFX9-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5147; GFX9-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5148; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
5149; GFX9-NEXT:    v_writelane_b32 v9, s30, 0
5150; GFX9-NEXT:    v_writelane_b32 v9, s31, 1
5151; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5152; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
5153; GFX9-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
5154; GFX9-NEXT:    s_waitcnt vmcnt(0)
5155; GFX9-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
5156; GFX9-NEXT:    s_waitcnt vmcnt(0)
5157; GFX9-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
5158; GFX9-NEXT:    s_waitcnt vmcnt(0)
5159; GFX9-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
5160; GFX9-NEXT:    s_waitcnt vmcnt(0)
5161; GFX9-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
5162; GFX9-NEXT:    s_waitcnt vmcnt(0)
5163; GFX9-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
5164; GFX9-NEXT:    s_waitcnt vmcnt(0)
5165; GFX9-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
5166; GFX9-NEXT:    s_waitcnt vmcnt(0)
5167; GFX9-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
5168; GFX9-NEXT:    s_waitcnt vmcnt(0)
5169; GFX9-NEXT:    v_readlane_b32 s31, v9, 1
5170; GFX9-NEXT:    v_readlane_b32 s30, v9, 0
5171; GFX9-NEXT:    s_mov_b32 s32, s33
5172; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
5173; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
5174; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
5175; GFX9-NEXT:    s_mov_b32 s33, s18
5176; GFX9-NEXT:    s_waitcnt vmcnt(0)
5177; GFX9-NEXT:    s_setpc_b64 s[30:31]
5178;
5179; GFX10-LABEL: test_call_v16bf16:
5180; GFX10:       ; %bb.0: ; %entry
5181; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5182; GFX10-NEXT:    s_mov_b32 s18, s33
5183; GFX10-NEXT:    s_mov_b32 s33, s32
5184; GFX10-NEXT:    s_xor_saveexec_b32 s16, -1
5185; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
5186; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5187; GFX10-NEXT:    s_mov_b32 exec_lo, s16
5188; GFX10-NEXT:    s_addk_i32 s32, 0x200
5189; GFX10-NEXT:    s_getpc_b64 s[16:17]
5190; GFX10-NEXT:    s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5191; GFX10-NEXT:    s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5192; GFX10-NEXT:    v_writelane_b32 v9, s30, 0
5193; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
5194; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
5195; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
5196; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
5197; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
5198; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5199; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
5200; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5201; GFX10-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
5202; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5203; GFX10-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
5204; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5205; GFX10-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
5206; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5207; GFX10-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
5208; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5209; GFX10-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
5210; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5211; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
5212; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5213; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
5214; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
5215; GFX10-NEXT:    s_mov_b32 s32, s33
5216; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
5217; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
5218; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5219; GFX10-NEXT:    s_mov_b32 exec_lo, s4
5220; GFX10-NEXT:    s_mov_b32 s33, s18
5221; GFX10-NEXT:    s_waitcnt vmcnt(0)
5222; GFX10-NEXT:    s_setpc_b64 s[30:31]
5223;
5224; GFX11-LABEL: test_call_v16bf16:
5225; GFX11:       ; %bb.0: ; %entry
5226; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5227; GFX11-NEXT:    s_mov_b32 s2, s33
5228; GFX11-NEXT:    s_mov_b32 s33, s32
5229; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
5230; GFX11-NEXT:    scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
5231; GFX11-NEXT:    s_mov_b32 exec_lo, s0
5232; GFX11-NEXT:    s_add_i32 s32, s32, 16
5233; GFX11-NEXT:    s_getpc_b64 s[0:1]
5234; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
5235; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
5236; GFX11-NEXT:    v_writelane_b32 v9, s30, 0
5237; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
5238; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
5239; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
5240; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
5241; GFX11-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 dlc
5242; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5243; GFX11-NEXT:    scratch_store_b128 v8, v[0:3], off dlc
5244; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5245; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
5246; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
5247; GFX11-NEXT:    s_mov_b32 s32, s33
5248; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
5249; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
5250; GFX11-NEXT:    s_mov_b32 exec_lo, s0
5251; GFX11-NEXT:    s_mov_b32 s33, s2
5252; GFX11-NEXT:    s_waitcnt vmcnt(0)
5253; GFX11-NEXT:    s_setpc_b64 s[30:31]
5254entry:
5255  %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
5256  store volatile <16 x bfloat> %result, ptr addrspace(5) %out
5257  ret void
5258}
5259
; test_alloca_load_store_ret
;
; Round-trips a bfloat argument through a stack slot: the IR body allocates a
; 2-byte-aligned bfloat in addrspace(5), writes %in to it with a volatile
; store, reads it back with a volatile load, and returns the loaded value.
;
; The check lines inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected output below shows, per check prefix:
;   - GCN and GFX7 prefixes -- v0 is multiplied by 1.0 and shifted right by 16
;     before buffer_store_short, and shifted left by 16 after
;     buffer_load_ushort.
;   - GFX8, GFX9 and GFX10 prefixes -- v0 is stored and reloaded directly with
;     buffer_store_short / buffer_load_ushort, with no extra ALU instructions.
;   - GFX11 prefix (shared by both gfx1100 RUN lines) -- the same pattern using
;     scratch_store_b16 / scratch_load_u16.
5260define bfloat @test_alloca_load_store_ret(bfloat %in) {
5261; GCN-LABEL: test_alloca_load_store_ret:
5262; GCN:       ; %bb.0: ; %entry
5263; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5264; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5265; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5266; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
5267; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5268; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
5269; GCN-NEXT:    s_waitcnt vmcnt(0)
5270; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5271; GCN-NEXT:    s_setpc_b64 s[30:31]
5272;
5273; GFX7-LABEL: test_alloca_load_store_ret:
5274; GFX7:       ; %bb.0: ; %entry
5275; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5276; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5277; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5278; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
5279; GFX7-NEXT:    s_waitcnt vmcnt(0)
5280; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
5281; GFX7-NEXT:    s_waitcnt vmcnt(0)
5282; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5283; GFX7-NEXT:    s_setpc_b64 s[30:31]
5284;
5285; GFX8-LABEL: test_alloca_load_store_ret:
5286; GFX8:       ; %bb.0: ; %entry
5287; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5288; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
5289; GFX8-NEXT:    s_waitcnt vmcnt(0)
5290; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
5291; GFX8-NEXT:    s_waitcnt vmcnt(0)
5292; GFX8-NEXT:    s_setpc_b64 s[30:31]
5293;
5294; GFX9-LABEL: test_alloca_load_store_ret:
5295; GFX9:       ; %bb.0: ; %entry
5296; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5297; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
5298; GFX9-NEXT:    s_waitcnt vmcnt(0)
5299; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
5300; GFX9-NEXT:    s_waitcnt vmcnt(0)
5301; GFX9-NEXT:    s_setpc_b64 s[30:31]
5302;
5303; GFX10-LABEL: test_alloca_load_store_ret:
5304; GFX10:       ; %bb.0: ; %entry
5305; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5306; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
5307; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5308; GFX10-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc dlc
5309; GFX10-NEXT:    s_waitcnt vmcnt(0)
5310; GFX10-NEXT:    s_setpc_b64 s[30:31]
5311;
5312; GFX11-LABEL: test_alloca_load_store_ret:
5313; GFX11:       ; %bb.0: ; %entry
5314; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5315; GFX11-NEXT:    scratch_store_b16 off, v0, s32 dlc
5316; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5317; GFX11-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
5318; GFX11-NEXT:    s_waitcnt vmcnt(0)
5319; GFX11-NEXT:    s_setpc_b64 s[30:31]
; NOTE(review): the "; %entry" suffix in every "%bb.0" check line above
; appears to come from the label below, so renaming it would presumably
; require regenerating the checks -- confirm before changing.
5320entry:
  ; Stack slot for the round trip: one bfloat, 2-byte aligned, addrspace(5).
5321  %in.addr = alloca bfloat, align 2, addrspace(5)
  ; Both accesses are volatile; every check prefix above expects exactly one
  ; 16-bit store followed by one 16-bit load of the same slot
  ; (presumably volatile is what keeps the pair from being folded away).
5322  store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
5323  %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
  ; The value read back from the slot is the function result.
5324  ret bfloat %loaded
5325}
5326
5327define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
5328; GCN-LABEL: test_overflow_stack:
5329; GCN:       ; %bb.0:
5330; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5331; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
5332; GCN-NEXT:    s_waitcnt expcnt(0)
5333; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
5334; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
5335; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
5336; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32
5337; GCN-NEXT:    s_waitcnt vmcnt(2)
5338; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5339; GCN-NEXT:    s_waitcnt expcnt(0)
5340; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
5341; GCN-NEXT:    s_waitcnt vmcnt(2)
5342; GCN-NEXT:    buffer_store_dword v32, v2, s[0:3], 0 offen
5343; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
5344; GCN-NEXT:    s_waitcnt vmcnt(2)
5345; GCN-NEXT:    buffer_store_dword v33, v2, s[0:3], 0 offen
5346; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
5347; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x6c, v0
5348; GCN-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
5349; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
5350; GCN-NEXT:    s_waitcnt expcnt(0)
5351; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x64, v0
5352; GCN-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
5353; GCN-NEXT:    s_waitcnt expcnt(0)
5354; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x60, v0
5355; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x5c, v0
5356; GCN-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
5357; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
5358; GCN-NEXT:    s_waitcnt expcnt(0)
5359; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x54, v0
5360; GCN-NEXT:    buffer_store_dword v27, v30, s[0:3], 0 offen
5361; GCN-NEXT:    s_waitcnt expcnt(0)
5362; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x50, v0
5363; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x4c, v0
5364; GCN-NEXT:    buffer_store_dword v26, v29, s[0:3], 0 offen
5365; GCN-NEXT:    s_waitcnt expcnt(0)
5366; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x48, v0
5367; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
5368; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x44, v0
5369; GCN-NEXT:    buffer_store_dword v25, v31, s[0:3], 0 offen
5370; GCN-NEXT:    s_waitcnt expcnt(0)
5371; GCN-NEXT:    v_add_i32_e32 v25, vcc, 64, v0
5372; GCN-NEXT:    v_add_i32_e32 v31, vcc, 60, v0
5373; GCN-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
5374; GCN-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
5375; GCN-NEXT:    s_waitcnt expcnt(0)
5376; GCN-NEXT:    v_add_i32_e32 v24, vcc, 52, v0
5377; GCN-NEXT:    buffer_store_dword v23, v28, s[0:3], 0 offen
5378; GCN-NEXT:    s_waitcnt expcnt(0)
5379; GCN-NEXT:    v_add_i32_e32 v23, vcc, 48, v0
5380; GCN-NEXT:    v_add_i32_e32 v28, vcc, 44, v0
5381; GCN-NEXT:    buffer_store_dword v22, v27, s[0:3], 0 offen
5382; GCN-NEXT:    s_waitcnt expcnt(0)
5383; GCN-NEXT:    v_add_i32_e32 v22, vcc, 40, v0
5384; GCN-NEXT:    v_add_i32_e32 v27, vcc, 36, v0
5385; GCN-NEXT:    buffer_store_dword v21, v30, s[0:3], 0 offen
5386; GCN-NEXT:    s_waitcnt expcnt(0)
5387; GCN-NEXT:    v_add_i32_e32 v21, vcc, 32, v0
5388; GCN-NEXT:    v_add_i32_e32 v30, vcc, 28, v0
5389; GCN-NEXT:    buffer_store_dword v20, v26, s[0:3], 0 offen
5390; GCN-NEXT:    s_waitcnt expcnt(0)
5391; GCN-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
5392; GCN-NEXT:    v_add_i32_e32 v26, vcc, 20, v0
5393; GCN-NEXT:    buffer_store_dword v19, v29, s[0:3], 0 offen
5394; GCN-NEXT:    s_waitcnt expcnt(0)
5395; GCN-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
5396; GCN-NEXT:    v_add_i32_e32 v29, vcc, 12, v0
5397; GCN-NEXT:    buffer_store_dword v18, v25, s[0:3], 0 offen
5398; GCN-NEXT:    s_waitcnt expcnt(0)
5399; GCN-NEXT:    v_add_i32_e32 v18, vcc, 8, v0
5400; GCN-NEXT:    v_add_i32_e32 v25, vcc, 4, v0
5401; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
5402; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
5403; GCN-NEXT:    buffer_store_dword v17, v31, s[0:3], 0 offen
5404; GCN-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
5405; GCN-NEXT:    buffer_store_dword v15, v24, s[0:3], 0 offen
5406; GCN-NEXT:    buffer_store_dword v14, v23, s[0:3], 0 offen
5407; GCN-NEXT:    buffer_store_dword v13, v28, s[0:3], 0 offen
5408; GCN-NEXT:    buffer_store_dword v12, v22, s[0:3], 0 offen
5409; GCN-NEXT:    buffer_store_dword v11, v27, s[0:3], 0 offen
5410; GCN-NEXT:    buffer_store_dword v10, v21, s[0:3], 0 offen
5411; GCN-NEXT:    buffer_store_dword v9, v30, s[0:3], 0 offen
5412; GCN-NEXT:    buffer_store_dword v8, v20, s[0:3], 0 offen
5413; GCN-NEXT:    buffer_store_dword v7, v26, s[0:3], 0 offen
5414; GCN-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
5415; GCN-NEXT:    buffer_store_dword v5, v29, s[0:3], 0 offen
5416; GCN-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
5417; GCN-NEXT:    buffer_store_dword v3, v25, s[0:3], 0 offen
5418; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
5419; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5420; GCN-NEXT:    s_setpc_b64 s[30:31]
5421;
5422; GFX7-LABEL: test_overflow_stack:
5423; GFX7:       ; %bb.0:
5424; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5425; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
5426; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
5427; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
5428; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
5429; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
5430; GFX7-NEXT:    s_waitcnt vmcnt(0)
5431; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5432; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
5433; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x78, v0
5434; GFX7-NEXT:    s_waitcnt vmcnt(0)
5435; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5436; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32
5437; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x74, v0
5438; GFX7-NEXT:    s_waitcnt vmcnt(0)
5439; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5440; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
5441; GFX7-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
5442; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
5443; GFX7-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
5444; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
5445; GFX7-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
5446; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
5447; GFX7-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
5448; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
5449; GFX7-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
5450; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
5451; GFX7-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
5452; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
5453; GFX7-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
5454; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
5455; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
5456; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
5457; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
5458; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
5459; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
5460; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
5461; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
5462; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
5463; GFX7-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
5464; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
5465; GFX7-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
5466; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
5467; GFX7-NEXT:    buffer_store_dword v17, v2, s[0:3], 0 offen
5468; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
5469; GFX7-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
5470; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
5471; GFX7-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
5472; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
5473; GFX7-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
5474; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
5475; GFX7-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
5476; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
5477; GFX7-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
5478; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
5479; GFX7-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
5480; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
5481; GFX7-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
5482; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
5483; GFX7-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
5484; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
5485; GFX7-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
5486; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
5487; GFX7-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
5488; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
5489; GFX7-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
5490; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
5491; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
5492; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
5493; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
5494; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
5495; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
5496; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
5497; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
5498; GFX7-NEXT:    s_waitcnt vmcnt(0)
5499; GFX7-NEXT:    s_setpc_b64 s[30:31]
5500;
5501; GFX8-LABEL: test_overflow_stack:
5502; GFX8:       ; %bb.0:
5503; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5504; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
5505; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
5506; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x7c, v0
5507; GFX8-NEXT:    s_waitcnt vmcnt(0)
5508; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5509; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
5510; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x78, v0
5511; GFX8-NEXT:    s_waitcnt vmcnt(0)
5512; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5513; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32
5514; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x74, v0
5515; GFX8-NEXT:    s_waitcnt vmcnt(0)
5516; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
5517; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
5518; GFX8-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
5519; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
5520; GFX8-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
5521; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
5522; GFX8-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
5523; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
5524; GFX8-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
5525; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
5526; GFX8-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
5527; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
5528; GFX8-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
5529; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
5530; GFX8-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
5531; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
5532; GFX8-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
5533; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
5534; GFX8-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
5535; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
5536; GFX8-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
5537; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
5538; GFX8-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
5539; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
5540; GFX8-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
5541; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
5542; GFX8-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
5543; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
5544; GFX8-NEXT:    buffer_store_dword v17, v2, s[0:3], 0 offen
5545; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
5546; GFX8-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
5547; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
5548; GFX8-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
5549; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
5550; GFX8-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
5551; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
5552; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
5553; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
5554; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
5555; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
5556; GFX8-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
5557; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
5558; GFX8-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
5559; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
5560; GFX8-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
5561; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
5562; GFX8-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
5563; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
5564; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
5565; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
5566; GFX8-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
5567; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
5568; GFX8-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
5569; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
5570; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
5571; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
5572; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
5573; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
5574; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
5575; GFX8-NEXT:    s_waitcnt vmcnt(0)
5576; GFX8-NEXT:    s_setpc_b64 s[30:31]
5577;
5578; GFX9-LABEL: test_overflow_stack:
5579; GFX9:       ; %bb.0:
5580; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5581; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
5582; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
5583; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
5584; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
5585; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
5586; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
5587; GFX9-NEXT:    s_nop 0
5588; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:8
5589; GFX9-NEXT:    s_nop 0
5590; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
5591; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32
5592; GFX9-NEXT:    s_nop 0
5593; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
5594; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
5595; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
5596; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
5597; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
5598; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
5599; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
5600; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
5601; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
5602; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
5603; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
5604; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
5605; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
5606; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
5607; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
5608; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
5609; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
5610; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
5611; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
5612; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
5613; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
5614; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
5615; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
5616; GFX9-NEXT:    s_waitcnt vmcnt(25)
5617; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
5618; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
5619; GFX9-NEXT:    s_waitcnt vmcnt(25)
5620; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
5621; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
5622; GFX9-NEXT:    s_waitcnt vmcnt(0)
5623; GFX9-NEXT:    s_setpc_b64 s[30:31]
5624;
5625; GFX10-LABEL: test_overflow_stack:
5626; GFX10:       ; %bb.0:
5627; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5628; GFX10-NEXT:    s_clause 0x2
5629; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
5630; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
5631; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32
5632; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
5633; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
5634; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
5635; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
5636; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
5637; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
5638; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
5639; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
5640; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
5641; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
5642; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
5643; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
5644; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
5645; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
5646; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
5647; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
5648; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
5649; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
5650; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
5651; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
5652; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
5653; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
5654; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
5655; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
5656; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
5657; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
5658; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
5659; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
5660; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
5661; GFX10-NEXT:    s_waitcnt vmcnt(2)
5662; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:124
5663; GFX10-NEXT:    s_waitcnt vmcnt(1)
5664; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
5665; GFX10-NEXT:    s_waitcnt vmcnt(0)
5666; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
5667; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
5668; GFX10-NEXT:    s_setpc_b64 s[30:31]
5669;
5670; GFX11-LABEL: test_overflow_stack:
5671; GFX11:       ; %bb.0:
5672; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5673; GFX11-NEXT:    s_clause 0x2
5674; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
5675; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
5676; GFX11-NEXT:    scratch_load_b32 v31, off, s32
5677; GFX11-NEXT:    s_clause 0x5
5678; GFX11-NEXT:    scratch_store_b128 v0, v[22:25], off offset:80
5679; GFX11-NEXT:    scratch_store_b128 v0, v[18:21], off offset:64
5680; GFX11-NEXT:    scratch_store_b128 v0, v[14:17], off offset:48
5681; GFX11-NEXT:    scratch_store_b128 v0, v[10:13], off offset:32
5682; GFX11-NEXT:    scratch_store_b128 v0, v[6:9], off offset:16
5683; GFX11-NEXT:    scratch_store_b128 v0, v[2:5], off
5684; GFX11-NEXT:    s_waitcnt vmcnt(0)
5685; GFX11-NEXT:    s_clause 0x2
5686; GFX11-NEXT:    scratch_store_b128 v0, v[30:33], off offset:112
5687; GFX11-NEXT:    scratch_store_b128 v0, v[26:29], off offset:96
5688; GFX11-NEXT:    scratch_store_b16 v0, v1, off offset:128
5689; GFX11-NEXT:    s_setpc_b64 s[30:31]
5690  %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
5691  %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
5692  ret { <32 x i32>, bfloat } %ins.1
5693}
5694
; Extending load: <2 x bfloat> is read from global memory (addrspace(1)) and
; fpext'ed to <2 x float>.
; Every checked target expects one 32-bit load followed by two VALU ops: a
; left shift by 16 that produces element 0 and an AND with 0xffff0000 that
; produces element 1. Only the load instruction differs between targets
; (buffer_load with addr64, flat_load, or global_load).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
5695define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
5696; GCN-LABEL: global_extload_v2bf16_to_v2f32:
5697; GCN:       ; %bb.0:
5698; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5699; GCN-NEXT:    s_mov_b32 s6, 0
5700; GCN-NEXT:    s_mov_b32 s7, 0xf000
5701; GCN-NEXT:    s_mov_b32 s4, s6
5702; GCN-NEXT:    s_mov_b32 s5, s6
5703; GCN-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
5704; GCN-NEXT:    s_waitcnt vmcnt(0)
5705; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5706; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5707; GCN-NEXT:    s_setpc_b64 s[30:31]
5708;
5709; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
5710; GFX7:       ; %bb.0:
5711; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5712; GFX7-NEXT:    s_mov_b32 s6, 0
5713; GFX7-NEXT:    s_mov_b32 s7, 0xf000
5714; GFX7-NEXT:    s_mov_b32 s4, s6
5715; GFX7-NEXT:    s_mov_b32 s5, s6
5716; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
5717; GFX7-NEXT:    s_waitcnt vmcnt(0)
5718; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5719; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5720; GFX7-NEXT:    s_setpc_b64 s[30:31]
5721;
5722; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
5723; GFX8:       ; %bb.0:
5724; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5725; GFX8-NEXT:    flat_load_dword v1, v[0:1]
5726; GFX8-NEXT:    s_waitcnt vmcnt(0)
5727; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5728; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5729; GFX8-NEXT:    s_setpc_b64 s[30:31]
5730;
5731; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
5732; GFX9:       ; %bb.0:
5733; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5734; GFX9-NEXT:    global_load_dword v1, v[0:1], off
5735; GFX9-NEXT:    s_waitcnt vmcnt(0)
5736; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5737; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5738; GFX9-NEXT:    s_setpc_b64 s[30:31]
5739;
5740; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
5741; GFX10:       ; %bb.0:
5742; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5743; GFX10-NEXT:    global_load_dword v1, v[0:1], off
5744; GFX10-NEXT:    s_waitcnt vmcnt(0)
5745; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5746; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5747; GFX10-NEXT:    s_setpc_b64 s[30:31]
5748;
5749; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
5750; GFX11:       ; %bb.0:
5751; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5752; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
5753; GFX11-NEXT:    s_waitcnt vmcnt(0)
5754; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5755; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5756; GFX11-NEXT:    s_setpc_b64 s[30:31]
5757  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
5758  %fpext = fpext <2 x bfloat> %load to <2 x float>
5759  ret <2 x float> %fpext
5760}
5761
; Extending load: <3 x bfloat> is read from global memory (addrspace(1)) and
; fpext'ed to <3 x float>.
; Every checked target expects a single 64-bit load (dwordx2 / b64) into
; v[1:2]. Elements 0 and 1 come from the first dword (shift left by 16, AND
; with 0xffff0000); element 2 comes from the second dword via a shift left by
; 16 only, so that dword's upper half is never used.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
5762define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
5763; GCN-LABEL: global_extload_v3bf16_to_v3f32:
5764; GCN:       ; %bb.0:
5765; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5766; GCN-NEXT:    s_mov_b32 s6, 0
5767; GCN-NEXT:    s_mov_b32 s7, 0xf000
5768; GCN-NEXT:    s_mov_b32 s4, s6
5769; GCN-NEXT:    s_mov_b32 s5, s6
5770; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
5771; GCN-NEXT:    s_waitcnt vmcnt(0)
5772; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5773; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5774; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
5775; GCN-NEXT:    s_setpc_b64 s[30:31]
5776;
5777; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
5778; GFX7:       ; %bb.0:
5779; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5780; GFX7-NEXT:    s_mov_b32 s6, 0
5781; GFX7-NEXT:    s_mov_b32 s7, 0xf000
5782; GFX7-NEXT:    s_mov_b32 s4, s6
5783; GFX7-NEXT:    s_mov_b32 s5, s6
5784; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
5785; GFX7-NEXT:    s_waitcnt vmcnt(0)
5786; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5787; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5788; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
5789; GFX7-NEXT:    s_setpc_b64 s[30:31]
5790;
5791; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
5792; GFX8:       ; %bb.0:
5793; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5794; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
5795; GFX8-NEXT:    s_waitcnt vmcnt(0)
5796; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5797; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5798; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
5799; GFX8-NEXT:    s_setpc_b64 s[30:31]
5800;
5801; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
5802; GFX9:       ; %bb.0:
5803; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5804; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
5805; GFX9-NEXT:    s_waitcnt vmcnt(0)
5806; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5807; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5808; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
5809; GFX9-NEXT:    s_setpc_b64 s[30:31]
5810;
5811; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
5812; GFX10:       ; %bb.0:
5813; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5814; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
5815; GFX10-NEXT:    s_waitcnt vmcnt(0)
5816; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5817; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5818; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
5819; GFX10-NEXT:    s_setpc_b64 s[30:31]
5820;
5821; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
5822; GFX11:       ; %bb.0:
5823; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5824; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
5825; GFX11-NEXT:    s_waitcnt vmcnt(0)
5826; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
5827; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5828; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
5829; GFX11-NEXT:    s_setpc_b64 s[30:31]
5830  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
5831  %fpext = fpext <3 x bfloat> %load to <3 x float>
5832  ret <3 x float> %fpext
5833}
5834
; Extending load: <4 x bfloat> is read from global memory (addrspace(1)) and
; fpext'ed to <4 x float>.
; Every checked target expects a single 64-bit load (dwordx2 / b64) into
; v[2:3]. Each loaded dword yields two results: a shift left by 16 for the
; even element and an AND with 0xffff0000 for the odd element, written to
; v0..v3.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
5835define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
5836; GCN-LABEL: global_extload_v4bf16_to_v4f32:
5837; GCN:       ; %bb.0:
5838; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5839; GCN-NEXT:    s_mov_b32 s6, 0
5840; GCN-NEXT:    s_mov_b32 s7, 0xf000
5841; GCN-NEXT:    s_mov_b32 s4, s6
5842; GCN-NEXT:    s_mov_b32 s5, s6
5843; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5844; GCN-NEXT:    s_waitcnt vmcnt(0)
5845; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5846; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5847; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5848; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5849; GCN-NEXT:    s_setpc_b64 s[30:31]
5850;
5851; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
5852; GFX7:       ; %bb.0:
5853; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5854; GFX7-NEXT:    s_mov_b32 s6, 0
5855; GFX7-NEXT:    s_mov_b32 s7, 0xf000
5856; GFX7-NEXT:    s_mov_b32 s4, s6
5857; GFX7-NEXT:    s_mov_b32 s5, s6
5858; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5859; GFX7-NEXT:    s_waitcnt vmcnt(0)
5860; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5861; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5862; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5863; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5864; GFX7-NEXT:    s_setpc_b64 s[30:31]
5865;
5866; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
5867; GFX8:       ; %bb.0:
5868; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5869; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
5870; GFX8-NEXT:    s_waitcnt vmcnt(0)
5871; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5872; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5873; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5874; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5875; GFX8-NEXT:    s_setpc_b64 s[30:31]
5876;
5877; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
5878; GFX9:       ; %bb.0:
5879; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5880; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
5881; GFX9-NEXT:    s_waitcnt vmcnt(0)
5882; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5883; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5884; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5885; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5886; GFX9-NEXT:    s_setpc_b64 s[30:31]
5887;
5888; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
5889; GFX10:       ; %bb.0:
5890; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5891; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
5892; GFX10-NEXT:    s_waitcnt vmcnt(0)
5893; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5894; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5895; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5896; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5897; GFX10-NEXT:    s_setpc_b64 s[30:31]
5898;
5899; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
5900; GFX11:       ; %bb.0:
5901; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5902; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
5903; GFX11-NEXT:    s_waitcnt vmcnt(0)
5904; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5905; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5906; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5907; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5908; GFX11-NEXT:    s_setpc_b64 s[30:31]
5909  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
5910  %fpext = fpext <4 x bfloat> %load to <4 x float>
5911  ret <4 x float> %fpext
5912}
5913
; Extending load: <5 x bfloat> is read from global memory (addrspace(1)) and
; fpext'ed to <5 x float>.
; The expected load sequence differs by target:
;  - the default amdgcn and hawaii runs expect two loads, a ushort load at
;    offset 8 for element 4 plus a dwordx2 load for elements 0..3, each with
;    its own wait (vmcnt(1) before shifting element 4, vmcnt(0) before the
;    rest);
;  - the tonga and newer runs expect one 128-bit load (dwordx4 / b128) into
;    v[2:5], with element 4 taken from the third loaded dword by a shift left
;    by 16.
; In all cases elements 0..3 are produced by shift-left-16 / AND 0xffff0000
; pairs on the first two dwords.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
5914define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
5915; GCN-LABEL: global_extload_v5bf16_to_v5f32:
5916; GCN:       ; %bb.0:
5917; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5918; GCN-NEXT:    s_mov_b32 s6, 0
5919; GCN-NEXT:    s_mov_b32 s7, 0xf000
5920; GCN-NEXT:    s_mov_b32 s4, s6
5921; GCN-NEXT:    s_mov_b32 s5, s6
5922; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
5923; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5924; GCN-NEXT:    s_waitcnt vmcnt(1)
5925; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5926; GCN-NEXT:    s_waitcnt vmcnt(0)
5927; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5928; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5929; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5930; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5931; GCN-NEXT:    s_setpc_b64 s[30:31]
5932;
5933; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
5934; GFX7:       ; %bb.0:
5935; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5936; GFX7-NEXT:    s_mov_b32 s6, 0
5937; GFX7-NEXT:    s_mov_b32 s7, 0xf000
5938; GFX7-NEXT:    s_mov_b32 s4, s6
5939; GFX7-NEXT:    s_mov_b32 s5, s6
5940; GFX7-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
5941; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5942; GFX7-NEXT:    s_waitcnt vmcnt(1)
5943; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5944; GFX7-NEXT:    s_waitcnt vmcnt(0)
5945; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5946; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5947; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5948; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5949; GFX7-NEXT:    s_setpc_b64 s[30:31]
5950;
5951; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
5952; GFX8:       ; %bb.0:
5953; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5954; GFX8-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
5955; GFX8-NEXT:    s_waitcnt vmcnt(0)
5956; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5957; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5958; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5959; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5960; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5961; GFX8-NEXT:    s_setpc_b64 s[30:31]
5962;
5963; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
5964; GFX9:       ; %bb.0:
5965; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5966; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
5967; GFX9-NEXT:    s_waitcnt vmcnt(0)
5968; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5969; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5970; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5971; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5972; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5973; GFX9-NEXT:    s_setpc_b64 s[30:31]
5974;
5975; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
5976; GFX10:       ; %bb.0:
5977; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5978; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
5979; GFX10-NEXT:    s_waitcnt vmcnt(0)
5980; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5981; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5982; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5983; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5984; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5985; GFX10-NEXT:    s_setpc_b64 s[30:31]
5986;
5987; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
5988; GFX11:       ; %bb.0:
5989; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5990; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off
5991; GFX11-NEXT:    s_waitcnt vmcnt(0)
5992; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
5993; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
5994; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
5995; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5996; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5997; GFX11-NEXT:    s_setpc_b64 s[30:31]
5998  %load = load <5 x bfloat>, ptr addrspace(1) %ptr
5999  %fpext = fpext <5 x bfloat> %load to <5 x float>
6000  ret <5 x float> %fpext
6001}
6002
; Extending load: <6 x bfloat> is read from global memory (addrspace(1)) and
; fpext'ed to <6 x float>.
; The expected load width differs by target:
;  - the default amdgcn run expects a 128-bit buffer_load_dwordx4 into v[3:6];
;    the fourth dword (v6) is never read afterwards
;    (NOTE(review): presumably because that target has no 3-dword load --
;    confirm against the target description);
;  - the hawaii and newer runs expect a 96-bit load (dwordx3 / b96) into
;    v[3:5].
; In all cases the three used dwords each yield two floats via a shift left by
; 16 and an AND with 0xffff0000, written to v0..v5.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6003define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
6004; GCN-LABEL: global_extload_v6bf16_to_v6f32:
6005; GCN:       ; %bb.0:
6006; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6007; GCN-NEXT:    s_mov_b32 s6, 0
6008; GCN-NEXT:    s_mov_b32 s7, 0xf000
6009; GCN-NEXT:    s_mov_b32 s4, s6
6010; GCN-NEXT:    s_mov_b32 s5, s6
6011; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
6012; GCN-NEXT:    s_waitcnt vmcnt(0)
6013; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
6014; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
6015; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
6016; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
6017; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
6018; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
6019; GCN-NEXT:    s_setpc_b64 s[30:31]
6020;
6021; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
6022; GFX7:       ; %bb.0:
6023; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6024; GFX7-NEXT:    s_mov_b32 s6, 0
6025; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6026; GFX7-NEXT:    s_mov_b32 s4, s6
6027; GFX7-NEXT:    s_mov_b32 s5, s6
6028; GFX7-NEXT:    buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
6029; GFX7-NEXT:    s_waitcnt vmcnt(0)
6030; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
6031; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
6032; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
6033; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
6034; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
6035; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
6036; GFX7-NEXT:    s_setpc_b64 s[30:31]
6037;
6038; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
6039; GFX8:       ; %bb.0:
6040; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6041; GFX8-NEXT:    flat_load_dwordx3 v[3:5], v[0:1]
6042; GFX8-NEXT:    s_waitcnt vmcnt(0)
6043; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
6044; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
6045; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
6046; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
6047; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
6048; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
6049; GFX8-NEXT:    s_setpc_b64 s[30:31]
6050;
6051; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
6052; GFX9:       ; %bb.0:
6053; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6054; GFX9-NEXT:    global_load_dwordx3 v[3:5], v[0:1], off
6055; GFX9-NEXT:    s_waitcnt vmcnt(0)
6056; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
6057; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
6058; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
6059; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
6060; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
6061; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
6062; GFX9-NEXT:    s_setpc_b64 s[30:31]
6063;
6064; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
6065; GFX10:       ; %bb.0:
6066; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6067; GFX10-NEXT:    global_load_dwordx3 v[3:5], v[0:1], off
6068; GFX10-NEXT:    s_waitcnt vmcnt(0)
6069; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
6070; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
6071; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
6072; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
6073; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
6074; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
6075; GFX10-NEXT:    s_setpc_b64 s[30:31]
6076;
6077; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
6078; GFX11:       ; %bb.0:
6079; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6080; GFX11-NEXT:    global_load_b96 v[3:5], v[0:1], off
6081; GFX11-NEXT:    s_waitcnt vmcnt(0)
6082; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
6083; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
6084; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
6085; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
6086; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
6087; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
6088; GFX11-NEXT:    s_setpc_b64 s[30:31]
6089  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
6090  %fpext = fpext <6 x bfloat> %load to <6 x float>
6091  ret <6 x float> %fpext
6092}
6093
; Extending load: <8 x bfloat> is read from global memory (addrspace(1)) and
; fpext'ed to <8 x float>.
; Every checked target expects a single 128-bit load (dwordx4 / b128) into
; v[4:7]. Each of the four loaded dwords yields two floats: a shift left by 16
; for the even element and an AND with 0xffff0000 for the odd element, written
; to v0..v7.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6094define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
6095; GCN-LABEL: global_extload_v8bf16_to_v8f32:
6096; GCN:       ; %bb.0:
6097; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6098; GCN-NEXT:    s_mov_b32 s6, 0
6099; GCN-NEXT:    s_mov_b32 s7, 0xf000
6100; GCN-NEXT:    s_mov_b32 s4, s6
6101; GCN-NEXT:    s_mov_b32 s5, s6
6102; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6103; GCN-NEXT:    s_waitcnt vmcnt(0)
6104; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6105; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6106; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6107; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6108; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6109; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6110; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6111; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6112; GCN-NEXT:    s_setpc_b64 s[30:31]
6113;
6114; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
6115; GFX7:       ; %bb.0:
6116; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6117; GFX7-NEXT:    s_mov_b32 s6, 0
6118; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6119; GFX7-NEXT:    s_mov_b32 s4, s6
6120; GFX7-NEXT:    s_mov_b32 s5, s6
6121; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6122; GFX7-NEXT:    s_waitcnt vmcnt(0)
6123; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6124; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6125; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6126; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6127; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6128; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6129; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6130; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6131; GFX7-NEXT:    s_setpc_b64 s[30:31]
6132;
6133; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
6134; GFX8:       ; %bb.0:
6135; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6136; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
6137; GFX8-NEXT:    s_waitcnt vmcnt(0)
6138; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6139; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6140; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6141; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6142; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6143; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6144; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6145; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6146; GFX8-NEXT:    s_setpc_b64 s[30:31]
6147;
6148; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
6149; GFX9:       ; %bb.0:
6150; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6151; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
6152; GFX9-NEXT:    s_waitcnt vmcnt(0)
6153; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6154; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6155; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6156; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6157; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6158; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6159; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6160; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6161; GFX9-NEXT:    s_setpc_b64 s[30:31]
6162;
6163; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
6164; GFX10:       ; %bb.0:
6165; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6166; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
6167; GFX10-NEXT:    s_waitcnt vmcnt(0)
6168; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6169; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6170; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6171; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6172; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6173; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6174; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6175; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6176; GFX10-NEXT:    s_setpc_b64 s[30:31]
6177;
6178; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
6179; GFX11:       ; %bb.0:
6180; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6181; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
6182; GFX11-NEXT:    s_waitcnt vmcnt(0)
6183; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6184; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6185; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6186; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6187; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6188; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6189; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6190; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6191; GFX11-NEXT:    s_setpc_b64 s[30:31]
6192  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
6193  %fpext = fpext <8 x bfloat> %load to <8 x float>
6194  ret <8 x float> %fpext
6195}
6196
; Test: load a <16 x bfloat> vector from addrspace(1) and fpext it to
; <16 x float>.
; Every set of checks below expects the 32 bytes to be fetched as two 128-bit
; loads (byte offsets 0 and 16). Each loaded dword is then widened into two
; 32-bit results: `v_lshlrev_b32 ..., 16, src` builds one from the low 16 bits
; and `v_and_b32 ..., 0xffff0000, src` builds the next from the high 16 bits.
; The waits are staggered (vmcnt(1), then vmcnt(0)) so the first loaded quad is
; expanded before the second load is waited on.
; The GFX8 checks use flat loads and form the +16 address with an explicit
; v_add_u32 / v_addc_u32 pair instead of an immediate offset; the GFX10 and
; GFX11 checks begin the load pair with s_clause 0x1.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6197define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
6198; GCN-LABEL: global_extload_v16bf16_to_v16f32:
6199; GCN:       ; %bb.0:
6200; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6201; GCN-NEXT:    s_mov_b32 s6, 0
6202; GCN-NEXT:    s_mov_b32 s7, 0xf000
6203; GCN-NEXT:    s_mov_b32 s4, s6
6204; GCN-NEXT:    s_mov_b32 s5, s6
6205; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6206; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6207; GCN-NEXT:    s_waitcnt vmcnt(1)
6208; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6209; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6210; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6211; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6212; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6213; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6214; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6215; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6216; GCN-NEXT:    s_waitcnt vmcnt(0)
6217; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6218; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6219; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6220; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6221; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6222; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6223; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6224; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6225; GCN-NEXT:    s_setpc_b64 s[30:31]
6226;
6227; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
6228; GFX7:       ; %bb.0:
6229; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6230; GFX7-NEXT:    s_mov_b32 s6, 0
6231; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6232; GFX7-NEXT:    s_mov_b32 s4, s6
6233; GFX7-NEXT:    s_mov_b32 s5, s6
6234; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6235; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6236; GFX7-NEXT:    s_waitcnt vmcnt(1)
6237; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6238; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6239; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6240; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6241; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6242; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6243; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6244; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6245; GFX7-NEXT:    s_waitcnt vmcnt(0)
6246; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6247; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6248; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6249; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6250; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6251; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6252; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6253; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6254; GFX7-NEXT:    s_setpc_b64 s[30:31]
6255;
6256; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
6257; GFX8:       ; %bb.0:
6258; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6259; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
6260; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
6261; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6262; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
6263; GFX8-NEXT:    s_waitcnt vmcnt(1)
6264; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6265; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6266; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6267; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6268; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6269; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6270; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6271; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6272; GFX8-NEXT:    s_waitcnt vmcnt(0)
6273; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6274; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6275; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6276; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6277; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6278; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6279; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6280; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6281; GFX8-NEXT:    s_setpc_b64 s[30:31]
6282;
6283; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
6284; GFX9:       ; %bb.0:
6285; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6286; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
6287; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
6288; GFX9-NEXT:    s_waitcnt vmcnt(1)
6289; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6290; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6291; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6292; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6293; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6294; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6295; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6296; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6297; GFX9-NEXT:    s_waitcnt vmcnt(0)
6298; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6299; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6300; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6301; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6302; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6303; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6304; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6305; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6306; GFX9-NEXT:    s_setpc_b64 s[30:31]
6307;
6308; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
6309; GFX10:       ; %bb.0:
6310; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6311; GFX10-NEXT:    s_clause 0x1
6312; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
6313; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
6314; GFX10-NEXT:    s_waitcnt vmcnt(1)
6315; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6316; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6317; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6318; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6319; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6320; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6321; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6322; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6323; GFX10-NEXT:    s_waitcnt vmcnt(0)
6324; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6325; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6326; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6327; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6328; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6329; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6330; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6331; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6332; GFX10-NEXT:    s_setpc_b64 s[30:31]
6333;
6334; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
6335; GFX11:       ; %bb.0:
6336; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6337; GFX11-NEXT:    s_clause 0x1
6338; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
6339; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:16
6340; GFX11-NEXT:    s_waitcnt vmcnt(1)
6341; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6342; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6343; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6344; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6345; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6346; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6347; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6348; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6349; GFX11-NEXT:    s_waitcnt vmcnt(0)
6350; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6351; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6352; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6353; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6354; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6355; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6356; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6357; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6358; GFX11-NEXT:    s_setpc_b64 s[30:31]
6359  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
6360  %fpext = fpext <16 x bfloat> %load to <16 x float>
6361  ret <16 x float> %fpext
6362}
6363
; Test: load a <32 x bfloat> vector from addrspace(1) and fpext it to
; <32 x float>.
; Every set of checks below expects four 128-bit loads (byte offsets 0, 16, 32
; and 48) followed by four expansion groups. Within a group each loaded dword
; yields two 32-bit results: `v_lshlrev_b32 ..., 16, src` from its low 16 bits
; and `v_and_b32 ..., 0xffff0000, src` from its high 16 bits.
; The waits count down (vmcnt(3), 2, 1, 0) so each group is expanded as soon as
; its own load has been waited on.
; The GFX8 checks use flat loads and build the +16, +32 and +48 addresses with
; explicit v_add_u32 / v_addc_u32 pairs instead of immediate offsets; the GFX10
; and GFX11 checks begin the four loads with s_clause 0x3.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6364define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
6365; GCN-LABEL: global_extload_v32bf16_to_v32f32:
6366; GCN:       ; %bb.0:
6367; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6368; GCN-NEXT:    s_mov_b32 s6, 0
6369; GCN-NEXT:    s_mov_b32 s7, 0xf000
6370; GCN-NEXT:    s_mov_b32 s4, s6
6371; GCN-NEXT:    s_mov_b32 s5, s6
6372; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6373; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6374; GCN-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
6375; GCN-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
6376; GCN-NEXT:    s_waitcnt vmcnt(3)
6377; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6378; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6379; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6380; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6381; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6382; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6383; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6384; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6385; GCN-NEXT:    s_waitcnt vmcnt(2)
6386; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6387; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6388; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6389; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6390; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6391; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6392; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6393; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6394; GCN-NEXT:    s_waitcnt vmcnt(1)
6395; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
6396; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
6397; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
6398; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
6399; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
6400; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
6401; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
6402; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
6403; GCN-NEXT:    s_waitcnt vmcnt(0)
6404; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
6405; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
6406; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
6407; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
6408; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
6409; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
6410; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
6411; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
6412; GCN-NEXT:    s_setpc_b64 s[30:31]
6413;
6414; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
6415; GFX7:       ; %bb.0:
6416; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6417; GFX7-NEXT:    s_mov_b32 s6, 0
6418; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6419; GFX7-NEXT:    s_mov_b32 s4, s6
6420; GFX7-NEXT:    s_mov_b32 s5, s6
6421; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6422; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6423; GFX7-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
6424; GFX7-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
6425; GFX7-NEXT:    s_waitcnt vmcnt(3)
6426; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6427; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6428; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6429; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6430; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6431; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6432; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6433; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6434; GFX7-NEXT:    s_waitcnt vmcnt(2)
6435; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6436; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6437; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6438; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6439; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6440; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6441; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6442; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6443; GFX7-NEXT:    s_waitcnt vmcnt(1)
6444; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
6445; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
6446; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
6447; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
6448; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
6449; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
6450; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
6451; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
6452; GFX7-NEXT:    s_waitcnt vmcnt(0)
6453; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
6454; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
6455; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
6456; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
6457; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
6458; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
6459; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
6460; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
6461; GFX7-NEXT:    s_setpc_b64 s[30:31]
6462;
6463; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
6464; GFX8:       ; %bb.0:
6465; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6466; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
6467; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
6468; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
6469; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[2:3]
6470; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
6471; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
6472; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
6473; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6474; GFX8-NEXT:    flat_load_dwordx4 v[20:23], v[2:3]
6475; GFX8-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
6476; GFX8-NEXT:    s_waitcnt vmcnt(3)
6477; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6478; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6479; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6480; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6481; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6482; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6483; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6484; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6485; GFX8-NEXT:    s_waitcnt vmcnt(2)
6486; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6487; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6488; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6489; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6490; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6491; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6492; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6493; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6494; GFX8-NEXT:    s_waitcnt vmcnt(1)
6495; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
6496; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
6497; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
6498; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
6499; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
6500; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
6501; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
6502; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
6503; GFX8-NEXT:    s_waitcnt vmcnt(0)
6504; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
6505; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
6506; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
6507; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
6508; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
6509; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
6510; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
6511; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
6512; GFX8-NEXT:    s_setpc_b64 s[30:31]
6513;
6514; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
6515; GFX9:       ; %bb.0:
6516; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6517; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
6518; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
6519; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
6520; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
6521; GFX9-NEXT:    s_waitcnt vmcnt(3)
6522; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6523; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6524; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6525; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6526; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6527; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6528; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6529; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6530; GFX9-NEXT:    s_waitcnt vmcnt(2)
6531; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6532; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6533; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6534; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6535; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6536; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6537; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6538; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6539; GFX9-NEXT:    s_waitcnt vmcnt(1)
6540; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
6541; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
6542; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
6543; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
6544; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
6545; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
6546; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
6547; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
6548; GFX9-NEXT:    s_waitcnt vmcnt(0)
6549; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
6550; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
6551; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
6552; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
6553; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
6554; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
6555; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
6556; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
6557; GFX9-NEXT:    s_setpc_b64 s[30:31]
6558;
6559; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
6560; GFX10:       ; %bb.0:
6561; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6562; GFX10-NEXT:    s_clause 0x3
6563; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
6564; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
6565; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
6566; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
6567; GFX10-NEXT:    s_waitcnt vmcnt(3)
6568; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6569; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6570; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6571; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6572; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6573; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6574; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6575; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6576; GFX10-NEXT:    s_waitcnt vmcnt(2)
6577; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6578; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6579; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6580; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6581; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6582; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6583; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6584; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6585; GFX10-NEXT:    s_waitcnt vmcnt(1)
6586; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
6587; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
6588; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
6589; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
6590; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
6591; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
6592; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
6593; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
6594; GFX10-NEXT:    s_waitcnt vmcnt(0)
6595; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
6596; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
6597; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
6598; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
6599; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
6600; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
6601; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
6602; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
6603; GFX10-NEXT:    s_setpc_b64 s[30:31]
6604;
6605; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
6606; GFX11:       ; %bb.0:
6607; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6608; GFX11-NEXT:    s_clause 0x3
6609; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
6610; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:16
6611; GFX11-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:32
6612; GFX11-NEXT:    global_load_b128 v[28:31], v[0:1], off offset:48
6613; GFX11-NEXT:    s_waitcnt vmcnt(3)
6614; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6615; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6616; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
6617; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
6618; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
6619; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
6620; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
6621; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
6622; GFX11-NEXT:    s_waitcnt vmcnt(2)
6623; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
6624; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
6625; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
6626; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
6627; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
6628; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
6629; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
6630; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
6631; GFX11-NEXT:    s_waitcnt vmcnt(1)
6632; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
6633; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
6634; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
6635; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
6636; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
6637; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
6638; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
6639; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
6640; GFX11-NEXT:    s_waitcnt vmcnt(0)
6641; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
6642; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
6643; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
6644; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
6645; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
6646; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
6647; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
6648; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
6649; GFX11-NEXT:    s_setpc_b64 s[30:31]
6650  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
6651  %fpext = fpext <32 x bfloat> %load to <32 x float>
6652  ret <32 x float> %fpext
6653}
6654
; Test: load a <2 x bfloat> vector from addrspace(1) and fpext it to
; <2 x double>.
; Every set of checks below expects one 32-bit load. The loaded dword is split
; into two 32-bit values (`v_lshlrev_b32 ..., 16, src` from the low 16 bits and
; `v_and_b32 ..., 0xffff0000, src` from the high 16 bits), and each value is
; then converted with v_cvt_f64_f32 into the register pairs v[0:1] and v[2:3].
; Only the temporary registers differ between targets: the GCN, GFX10 and GFX11
; checks load into v0, while the GFX7, GFX8 and GFX9 checks load into v2. The
; GFX11 checks additionally expect an s_delay_alu before the conversions.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6655define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
6656; GCN-LABEL: global_extload_v2bf16_to_v2f64:
6657; GCN:       ; %bb.0:
6658; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6659; GCN-NEXT:    s_mov_b32 s6, 0
6660; GCN-NEXT:    s_mov_b32 s7, 0xf000
6661; GCN-NEXT:    s_mov_b32 s4, s6
6662; GCN-NEXT:    s_mov_b32 s5, s6
6663; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
6664; GCN-NEXT:    s_waitcnt vmcnt(0)
6665; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
6666; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
6667; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
6668; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
6669; GCN-NEXT:    s_setpc_b64 s[30:31]
6670;
6671; GFX7-LABEL: global_extload_v2bf16_to_v2f64:
6672; GFX7:       ; %bb.0:
6673; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6674; GFX7-NEXT:    s_mov_b32 s6, 0
6675; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6676; GFX7-NEXT:    s_mov_b32 s4, s6
6677; GFX7-NEXT:    s_mov_b32 s5, s6
6678; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
6679; GFX7-NEXT:    s_waitcnt vmcnt(0)
6680; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
6681; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
6682; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6683; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
6684; GFX7-NEXT:    s_setpc_b64 s[30:31]
6685;
6686; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
6687; GFX8:       ; %bb.0:
6688; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6689; GFX8-NEXT:    flat_load_dword v2, v[0:1]
6690; GFX8-NEXT:    s_waitcnt vmcnt(0)
6691; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
6692; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
6693; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6694; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
6695; GFX8-NEXT:    s_setpc_b64 s[30:31]
6696;
6697; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
6698; GFX9:       ; %bb.0:
6699; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6700; GFX9-NEXT:    global_load_dword v2, v[0:1], off
6701; GFX9-NEXT:    s_waitcnt vmcnt(0)
6702; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
6703; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
6704; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6705; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
6706; GFX9-NEXT:    s_setpc_b64 s[30:31]
6707;
6708; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
6709; GFX10:       ; %bb.0:
6710; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6711; GFX10-NEXT:    global_load_dword v0, v[0:1], off
6712; GFX10-NEXT:    s_waitcnt vmcnt(0)
6713; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
6714; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
6715; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
6716; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
6717; GFX10-NEXT:    s_setpc_b64 s[30:31]
6718;
6719; GFX11-LABEL: global_extload_v2bf16_to_v2f64:
6720; GFX11:       ; %bb.0:
6721; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6722; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
6723; GFX11-NEXT:    s_waitcnt vmcnt(0)
6724; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
6725; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
6726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6727; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
6728; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
6729; GFX11-NEXT:    s_setpc_b64 s[30:31]
6730  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
6731  %fpext = fpext <2 x bfloat> %load to <2 x double>
6732  ret <2 x double> %fpext
6733}
6734
; Test: load a <3 x bfloat> vector from addrspace(1) and fpext it to
; <3 x double>.
; Every set of checks below expects one 64-bit load (two dwords). Three 32-bit
; values are formed from it: `v_lshlrev_b32 ..., 16` and
; `v_and_b32 ..., 0xffff0000` on the first dword, and only the
; `v_lshlrev_b32 ..., 16` on the second dword (its high 16 bits are never
; read). Each value is then converted with v_cvt_f64_f32 into the register
; pairs v[0:1], v[2:3] and v[4:5].
; Only the load destination differs between targets: v[0:1] in the GCN, GFX10
; and GFX11 checks, v[1:2] in the GFX7, GFX8 and GFX9 checks. The GFX11 checks
; additionally expect s_delay_alu instructions between the conversions.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
6735define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
6736; GCN-LABEL: global_extload_v3bf16_to_v3f64:
6737; GCN:       ; %bb.0:
6738; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6739; GCN-NEXT:    s_mov_b32 s6, 0
6740; GCN-NEXT:    s_mov_b32 s7, 0xf000
6741; GCN-NEXT:    s_mov_b32 s4, s6
6742; GCN-NEXT:    s_mov_b32 s5, s6
6743; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6744; GCN-NEXT:    s_waitcnt vmcnt(0)
6745; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6746; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6747; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6748; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6749; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6750; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6751; GCN-NEXT:    s_setpc_b64 s[30:31]
6752;
6753; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
6754; GFX7:       ; %bb.0:
6755; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6756; GFX7-NEXT:    s_mov_b32 s6, 0
6757; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6758; GFX7-NEXT:    s_mov_b32 s4, s6
6759; GFX7-NEXT:    s_mov_b32 s5, s6
6760; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
6761; GFX7-NEXT:    s_waitcnt vmcnt(0)
6762; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
6763; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6764; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
6765; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6766; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6767; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6768; GFX7-NEXT:    s_setpc_b64 s[30:31]
6769;
6770; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
6771; GFX8:       ; %bb.0:
6772; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6773; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
6774; GFX8-NEXT:    s_waitcnt vmcnt(0)
6775; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
6776; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6777; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
6778; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6779; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6780; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6781; GFX8-NEXT:    s_setpc_b64 s[30:31]
6782;
6783; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
6784; GFX9:       ; %bb.0:
6785; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6786; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
6787; GFX9-NEXT:    s_waitcnt vmcnt(0)
6788; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
6789; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6790; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
6791; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6792; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6793; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6794; GFX9-NEXT:    s_setpc_b64 s[30:31]
6795;
6796; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
6797; GFX10:       ; %bb.0:
6798; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6799; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
6800; GFX10-NEXT:    s_waitcnt vmcnt(0)
6801; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6802; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6803; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6804; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6805; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6806; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6807; GFX10-NEXT:    s_setpc_b64 s[30:31]
6808;
6809; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
6810; GFX11:       ; %bb.0:
6811; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6812; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
6813; GFX11-NEXT:    s_waitcnt vmcnt(0)
6814; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6815; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6816; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6817; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6818; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6819; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6820; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
6821; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6822; GFX11-NEXT:    s_setpc_b64 s[30:31]
6823  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
6824  %fpext = fpext <3 x bfloat> %load to <3 x double>
6825  ret <3 x double> %fpext
6826}
6827
6828define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
6829; GCN-LABEL: global_extload_v4bf16_to_v4f64:
6830; GCN:       ; %bb.0:
6831; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6832; GCN-NEXT:    s_mov_b32 s6, 0
6833; GCN-NEXT:    s_mov_b32 s7, 0xf000
6834; GCN-NEXT:    s_mov_b32 s4, s6
6835; GCN-NEXT:    s_mov_b32 s5, s6
6836; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6837; GCN-NEXT:    s_waitcnt vmcnt(0)
6838; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6839; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6840; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6841; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6842; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6843; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6844; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6845; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6846; GCN-NEXT:    s_setpc_b64 s[30:31]
6847;
6848; GFX7-LABEL: global_extload_v4bf16_to_v4f64:
6849; GFX7:       ; %bb.0:
6850; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6851; GFX7-NEXT:    s_mov_b32 s6, 0
6852; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6853; GFX7-NEXT:    s_mov_b32 s4, s6
6854; GFX7-NEXT:    s_mov_b32 s5, s6
6855; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6856; GFX7-NEXT:    s_waitcnt vmcnt(0)
6857; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6858; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6859; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6860; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6861; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6862; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6863; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6864; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6865; GFX7-NEXT:    s_setpc_b64 s[30:31]
6866;
6867; GFX8-LABEL: global_extload_v4bf16_to_v4f64:
6868; GFX8:       ; %bb.0:
6869; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6870; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
6871; GFX8-NEXT:    s_waitcnt vmcnt(0)
6872; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6873; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6874; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6875; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6876; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6877; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6878; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6879; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6880; GFX8-NEXT:    s_setpc_b64 s[30:31]
6881;
6882; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
6883; GFX9:       ; %bb.0:
6884; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6885; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
6886; GFX9-NEXT:    s_waitcnt vmcnt(0)
6887; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
6888; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6889; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
6890; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6891; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
6892; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6893; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6894; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6895; GFX9-NEXT:    s_setpc_b64 s[30:31]
6896;
6897; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
6898; GFX10:       ; %bb.0:
6899; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6900; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
6901; GFX10-NEXT:    s_waitcnt vmcnt(0)
6902; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
6903; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6904; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
6905; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
6906; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6907; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6908; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6909; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6910; GFX10-NEXT:    s_setpc_b64 s[30:31]
6911;
6912; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
6913; GFX11:       ; %bb.0:
6914; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6915; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
6916; GFX11-NEXT:    s_waitcnt vmcnt(0)
6917; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
6918; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6919; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
6920; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
6921; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6922; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
6923; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
6924; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6925; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
6926; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6927; GFX11-NEXT:    s_setpc_b64 s[30:31]
6928  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
6929  %fpext = fpext <4 x bfloat> %load to <4 x double>
6930  ret <4 x double> %fpext
6931}
6932
; Load <5 x bfloat> from a global (addrspace 1) pointer and fpext every lane
; to double. Per the checks below, the default-CPU (GCN prefix) and GFX7 runs
; fetch the fifth element with a separate 16-bit load at offset 8 next to a
; dwordx2 load, while the GFX8 and newer runs issue one 128-bit load and use
; only the low half of the third dword. Each lane is placed in the top 16 bits
; of a 32-bit register (shift left by 16, or mask with 0xffff0000) and then
; fed to v_cvt_f64_f32.
6933define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
6934; GCN-LABEL: global_extload_v5bf16_to_v5f64:
6935; GCN:       ; %bb.0:
6936; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6937; GCN-NEXT:    s_mov_b32 s6, 0
6938; GCN-NEXT:    s_mov_b32 s7, 0xf000
6939; GCN-NEXT:    s_mov_b32 s4, s6
6940; GCN-NEXT:    s_mov_b32 s5, s6
6941; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
6942; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6943; GCN-NEXT:    s_waitcnt vmcnt(1)
6944; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
6945; GCN-NEXT:    s_waitcnt vmcnt(0)
6946; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
6947; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
6948; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
6949; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6950; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
6951; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
6952; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
6953; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
6954; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6955; GCN-NEXT:    s_setpc_b64 s[30:31]
6956;
6957; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
6958; GFX7:       ; %bb.0:
6959; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6960; GFX7-NEXT:    s_mov_b32 s6, 0
6961; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6962; GFX7-NEXT:    s_mov_b32 s4, s6
6963; GFX7-NEXT:    s_mov_b32 s5, s6
6964; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
6965; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6966; GFX7-NEXT:    s_waitcnt vmcnt(1)
6967; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
6968; GFX7-NEXT:    s_waitcnt vmcnt(0)
6969; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
6970; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
6971; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
6972; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6973; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
6974; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
6975; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
6976; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
6977; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6978; GFX7-NEXT:    s_setpc_b64 s[30:31]
6979;
6980; GFX8-LABEL: global_extload_v5bf16_to_v5f64:
6981; GFX8:       ; %bb.0:
6982; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6983; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
6984; GFX8-NEXT:    s_waitcnt vmcnt(0)
6985; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
6986; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
6987; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
6988; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
6989; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
6990; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
6991; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
6992; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
6993; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
6994; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
6995; GFX8-NEXT:    s_setpc_b64 s[30:31]
6996;
6997; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
6998; GFX9:       ; %bb.0:
6999; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7000; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
7001; GFX9-NEXT:    s_waitcnt vmcnt(0)
7002; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
7003; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
7004; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
7005; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
7006; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7007; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
7008; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
7009; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7010; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7011; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7012; GFX9-NEXT:    s_setpc_b64 s[30:31]
7013;
7014; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
7015; GFX10:       ; %bb.0:
7016; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7017; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
7018; GFX10-NEXT:    s_waitcnt vmcnt(0)
7019; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7020; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7021; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
7022; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
7023; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
7024; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7025; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7026; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7027; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7028; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7029; GFX10-NEXT:    s_setpc_b64 s[30:31]
7030;
7031; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
7032; GFX11:       ; %bb.0:
7033; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7034; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off
7035; GFX11-NEXT:    s_waitcnt vmcnt(0)
7036; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7037; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7038; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
7039; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
7040; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
7041; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7042; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7043; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7044; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7045; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7046; GFX11-NEXT:    s_setpc_b64 s[30:31]
7047  %load = load <5 x bfloat>, ptr addrspace(1) %ptr
7048  %fpext = fpext <5 x bfloat> %load to <5 x double>
7049  ret <5 x double> %fpext
7050}
7051
; Load <6 x bfloat> from a global (addrspace 1) pointer and fpext every lane
; to double. Per the checks below, the default-CPU (GCN prefix) run fetches
; the data with a dwordx4 buffer load, whereas the GFX7 and newer runs use a
; three-dword (dwordx3 / b96) load. Each dword yields two lanes, the low half
; via a shift left by 16 and the high half via a mask with 0xffff0000, and
; every resulting 32-bit value is then fed to v_cvt_f64_f32.
7052define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
7053; GCN-LABEL: global_extload_v6bf16_to_v6f64:
7054; GCN:       ; %bb.0:
7055; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7056; GCN-NEXT:    s_mov_b32 s6, 0
7057; GCN-NEXT:    s_mov_b32 s7, 0xf000
7058; GCN-NEXT:    s_mov_b32 s4, s6
7059; GCN-NEXT:    s_mov_b32 s5, s6
7060; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7061; GCN-NEXT:    s_waitcnt vmcnt(0)
7062; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
7063; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
7064; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
7065; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
7066; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7067; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7068; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
7069; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
7070; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7071; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7072; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7073; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7074; GCN-NEXT:    s_setpc_b64 s[30:31]
7075;
7076; GFX7-LABEL: global_extload_v6bf16_to_v6f64:
7077; GFX7:       ; %bb.0:
7078; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7079; GFX7-NEXT:    s_mov_b32 s6, 0
7080; GFX7-NEXT:    s_mov_b32 s7, 0xf000
7081; GFX7-NEXT:    s_mov_b32 s4, s6
7082; GFX7-NEXT:    s_mov_b32 s5, s6
7083; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
7084; GFX7-NEXT:    s_waitcnt vmcnt(0)
7085; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
7086; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
7087; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
7088; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
7089; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7090; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7091; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
7092; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
7093; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7094; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7095; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7096; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7097; GFX7-NEXT:    s_setpc_b64 s[30:31]
7098;
7099; GFX8-LABEL: global_extload_v6bf16_to_v6f64:
7100; GFX8:       ; %bb.0:
7101; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7102; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
7103; GFX8-NEXT:    s_waitcnt vmcnt(0)
7104; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
7105; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
7106; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
7107; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
7108; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7109; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7110; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
7111; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
7112; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7113; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7114; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7115; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7116; GFX8-NEXT:    s_setpc_b64 s[30:31]
7117;
7118; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
7119; GFX9:       ; %bb.0:
7120; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7121; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
7122; GFX9-NEXT:    s_waitcnt vmcnt(0)
7123; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
7124; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
7125; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
7126; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
7127; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7128; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7129; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
7130; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
7131; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
7132; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7133; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7134; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7135; GFX9-NEXT:    s_setpc_b64 s[30:31]
7136;
7137; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
7138; GFX10:       ; %bb.0:
7139; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7140; GFX10-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
7141; GFX10-NEXT:    s_waitcnt vmcnt(0)
7142; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
7143; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
7144; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
7145; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
7146; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
7147; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
7148; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7149; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7150; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
7151; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7152; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7153; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7154; GFX10-NEXT:    s_setpc_b64 s[30:31]
7155;
7156; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
7157; GFX11:       ; %bb.0:
7158; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7159; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
7160; GFX11-NEXT:    s_waitcnt vmcnt(0)
7161; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
7162; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
7163; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
7164; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
7165; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
7166; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
7167; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7168; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7169; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
7170; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7171; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7172; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7173; GFX11-NEXT:    s_setpc_b64 s[30:31]
7174  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
7175  %fpext = fpext <6 x bfloat> %load to <6 x double>
7176  ret <6 x double> %fpext
7177}
7178
; Load <8 x bfloat> from a global (addrspace 1) pointer and fpext every lane
; to double. Per the checks below, every run configuration fetches the whole
; vector with a single four-dword (dwordx4 / b128) load. Each dword yields two
; lanes, the low half via a shift left by 16 and the high half via a mask with
; 0xffff0000, and every resulting 32-bit value is then fed to v_cvt_f64_f32,
; filling v[0:15] with the eight doubles.
7179define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
7180; GCN-LABEL: global_extload_v8bf16_to_v8f64:
7181; GCN:       ; %bb.0:
7182; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7183; GCN-NEXT:    s_mov_b32 s6, 0
7184; GCN-NEXT:    s_mov_b32 s7, 0xf000
7185; GCN-NEXT:    s_mov_b32 s4, s6
7186; GCN-NEXT:    s_mov_b32 s5, s6
7187; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7188; GCN-NEXT:    s_waitcnt vmcnt(0)
7189; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
7190; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
7191; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
7192; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
7193; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7194; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7195; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
7196; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
7197; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
7198; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
7199; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
7200; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7201; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7202; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7203; GCN-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7204; GCN-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7205; GCN-NEXT:    s_setpc_b64 s[30:31]
7206;
7207; GFX7-LABEL: global_extload_v8bf16_to_v8f64:
7208; GFX7:       ; %bb.0:
7209; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7210; GFX7-NEXT:    s_mov_b32 s6, 0
7211; GFX7-NEXT:    s_mov_b32 s7, 0xf000
7212; GFX7-NEXT:    s_mov_b32 s4, s6
7213; GFX7-NEXT:    s_mov_b32 s5, s6
7214; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7215; GFX7-NEXT:    s_waitcnt vmcnt(0)
7216; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
7217; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
7218; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
7219; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
7220; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7221; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7222; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
7223; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
7224; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
7225; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
7226; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
7227; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7228; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7229; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7230; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7231; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7232; GFX7-NEXT:    s_setpc_b64 s[30:31]
7233;
7234; GFX8-LABEL: global_extload_v8bf16_to_v8f64:
7235; GFX8:       ; %bb.0:
7236; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7237; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
7238; GFX8-NEXT:    s_waitcnt vmcnt(0)
7239; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
7240; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
7241; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
7242; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
7243; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7244; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7245; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
7246; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
7247; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
7248; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
7249; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
7250; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7251; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7252; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7253; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7254; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7255; GFX8-NEXT:    s_setpc_b64 s[30:31]
7256;
7257; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
7258; GFX9:       ; %bb.0:
7259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7260; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
7261; GFX9-NEXT:    s_waitcnt vmcnt(0)
7262; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
7263; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
7264; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
7265; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
7266; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
7267; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
7268; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
7269; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
7270; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
7271; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
7272; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
7273; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7274; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7275; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
7276; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7277; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7278; GFX9-NEXT:    s_setpc_b64 s[30:31]
7279;
7280; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
7281; GFX10:       ; %bb.0:
7282; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7283; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
7284; GFX10-NEXT:    s_waitcnt vmcnt(0)
7285; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
7286; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
7287; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
7288; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
7289; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
7290; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
7291; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
7292; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
7293; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7294; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7295; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
7296; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7297; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7298; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
7299; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7300; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7301; GFX10-NEXT:    s_setpc_b64 s[30:31]
7302;
7303; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
7304; GFX11:       ; %bb.0:
7305; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7306; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
7307; GFX11-NEXT:    s_waitcnt vmcnt(0)
7308; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
7309; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
7310; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
7311; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
7312; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
7313; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
7314; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
7315; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
7316; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7317; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7318; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
7319; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7320; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7321; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
7322; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7323; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7324; GFX11-NEXT:    s_setpc_b64 s[30:31]
7325  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
7326  %fpext = fpext <8 x bfloat> %load to <8 x double>
7327  ret <8 x double> %fpext
7328}
7329
; Load <16 x bfloat> from a global (addrspace 1) pointer and fpext every lane
; to double. Per the checks below, every run configuration splits the fetch
; into two four-dword loads, the second one 16 bytes past the first. The GFX8
; run forms that second address with v_add_u32 / v_addc_u32 instead of an
; offset field, and the GFX10 / GFX11 runs group both loads under
; s_clause 0x1. Unpacking of the first eight lanes starts after a vmcnt(1)
; wait and the remaining eight after vmcnt(0). Each lane is placed in the top
; 16 bits of a 32-bit register (shift left by 16, or mask with 0xffff0000)
; and then fed to v_cvt_f64_f32, filling v[0:31].
7330define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
7331; GCN-LABEL: global_extload_v16bf16_to_v16f64:
7332; GCN:       ; %bb.0:
7333; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7334; GCN-NEXT:    s_mov_b32 s6, 0
7335; GCN-NEXT:    s_mov_b32 s7, 0xf000
7336; GCN-NEXT:    s_mov_b32 s4, s6
7337; GCN-NEXT:    s_mov_b32 s5, s6
7338; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
7339; GCN-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
7340; GCN-NEXT:    s_waitcnt vmcnt(1)
7341; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7342; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7343; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
7344; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
7345; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
7346; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
7347; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
7348; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
7349; GCN-NEXT:    s_waitcnt vmcnt(0)
7350; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
7351; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
7352; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
7353; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
7354; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
7355; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
7356; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
7357; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
7358; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7359; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7360; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
7361; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
7362; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
7363; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
7364; GCN-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
7365; GCN-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
7366; GCN-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7367; GCN-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
7368; GCN-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
7369; GCN-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
7370; GCN-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
7371; GCN-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
7372; GCN-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
7373; GCN-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
7374; GCN-NEXT:    s_setpc_b64 s[30:31]
7375;
7376; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
7377; GFX7:       ; %bb.0:
7378; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7379; GFX7-NEXT:    s_mov_b32 s6, 0
7380; GFX7-NEXT:    s_mov_b32 s7, 0xf000
7381; GFX7-NEXT:    s_mov_b32 s4, s6
7382; GFX7-NEXT:    s_mov_b32 s5, s6
7383; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
7384; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
7385; GFX7-NEXT:    s_waitcnt vmcnt(1)
7386; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7387; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7388; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
7389; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
7390; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
7391; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
7392; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
7393; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
7394; GFX7-NEXT:    s_waitcnt vmcnt(0)
7395; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
7396; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
7397; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
7398; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
7399; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
7400; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
7401; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
7402; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
7403; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7404; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7405; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
7406; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
7407; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
7408; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
7409; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
7410; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
7411; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7412; GFX7-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
7413; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
7414; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
7415; GFX7-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
7416; GFX7-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
7417; GFX7-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
7418; GFX7-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
7419; GFX7-NEXT:    s_setpc_b64 s[30:31]
7420;
7421; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
7422; GFX8:       ; %bb.0:
7423; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7424; GFX8-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
7425; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7426; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7427; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
7428; GFX8-NEXT:    s_waitcnt vmcnt(1)
7429; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7430; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7431; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
7432; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
7433; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
7434; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
7435; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
7436; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
7437; GFX8-NEXT:    s_waitcnt vmcnt(0)
7438; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
7439; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
7440; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
7441; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
7442; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
7443; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
7444; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
7445; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
7446; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7447; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7448; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
7449; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
7450; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
7451; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
7452; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
7453; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
7454; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7455; GFX8-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
7456; GFX8-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
7457; GFX8-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
7458; GFX8-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
7459; GFX8-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
7460; GFX8-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
7461; GFX8-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
7462; GFX8-NEXT:    s_setpc_b64 s[30:31]
7463;
7464; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
7465; GFX9:       ; %bb.0:
7466; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7467; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
7468; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
7469; GFX9-NEXT:    s_waitcnt vmcnt(1)
7470; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7471; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7472; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
7473; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
7474; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
7475; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
7476; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
7477; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
7478; GFX9-NEXT:    s_waitcnt vmcnt(0)
7479; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
7480; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
7481; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
7482; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
7483; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
7484; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
7485; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
7486; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
7487; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7488; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7489; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
7490; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
7491; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
7492; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
7493; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
7494; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
7495; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7496; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
7497; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
7498; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
7499; GFX9-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
7500; GFX9-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
7501; GFX9-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
7502; GFX9-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
7503; GFX9-NEXT:    s_setpc_b64 s[30:31]
7504;
7505; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
7506; GFX10:       ; %bb.0:
7507; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7508; GFX10-NEXT:    s_clause 0x1
7509; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
7510; GFX10-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off offset:16
7511; GFX10-NEXT:    s_waitcnt vmcnt(1)
7512; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
7513; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
7514; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
7515; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
7516; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
7517; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
7518; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
7519; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
7520; GFX10-NEXT:    s_waitcnt vmcnt(0)
7521; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
7522; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v9
7523; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
7524; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
7525; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
7526; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v11
7527; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v12
7528; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v12
7529; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7530; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7531; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
7532; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
7533; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7534; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
7535; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
7536; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
7537; GFX10-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7538; GFX10-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
7539; GFX10-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
7540; GFX10-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
7541; GFX10-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
7542; GFX10-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
7543; GFX10-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
7544; GFX10-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
7545; GFX10-NEXT:    s_setpc_b64 s[30:31]
7546;
7547; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
7548; GFX11:       ; %bb.0:
7549; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7550; GFX11-NEXT:    s_clause 0x1
7551; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
7552; GFX11-NEXT:    global_load_b128 v[23:26], v[0:1], off offset:16
7553; GFX11-NEXT:    s_waitcnt vmcnt(1)
7554; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
7555; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
7556; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
7557; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
7558; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
7559; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
7560; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
7561; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
7562; GFX11-NEXT:    s_waitcnt vmcnt(0)
7563; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
7564; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
7565; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
7566; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v24
7567; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
7568; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v25
7569; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
7570; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v26
7571; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
7572; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
7573; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
7574; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
7575; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
7576; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
7577; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
7578; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
7579; GFX11-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7580; GFX11-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
7581; GFX11-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
7582; GFX11-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
7583; GFX11-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
7584; GFX11-NEXT:    v_cvt_f64_f32_e32 v[26:27], v27
7585; GFX11-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
7586; GFX11-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
7587; GFX11-NEXT:    s_setpc_b64 s[30:31]
7588  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
7589  %fpext = fpext <16 x bfloat> %load to <16 x double>
7590  ret <16 x double> %fpext
7591}
7592
7593define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
7594; GCN-LABEL: global_extload_v32bf16_to_v32f64:
7595; GCN:       ; %bb.0:
7596; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7597; GCN-NEXT:    s_mov_b32 s6, 0
7598; GCN-NEXT:    s_mov_b32 s7, 0xf000
7599; GCN-NEXT:    s_mov_b32 s4, s6
7600; GCN-NEXT:    s_mov_b32 s5, s6
7601; GCN-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
7602; GCN-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
7603; GCN-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
7604; GCN-NEXT:    buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6
7605; GCN-NEXT:    buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8
7606; GCN-NEXT:    buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
7607; GCN-NEXT:    buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12
7608; GCN-NEXT:    buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14
7609; GCN-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16
7610; GCN-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18
7611; GCN-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20
7612; GCN-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22
7613; GCN-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24
7614; GCN-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
7615; GCN-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
7616; GCN-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
7617; GCN-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
7618; GCN-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
7619; GCN-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
7620; GCN-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
7621; GCN-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
7622; GCN-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
7623; GCN-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
7624; GCN-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
7625; GCN-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
7626; GCN-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
7627; GCN-NEXT:    buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
7628; GCN-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
7629; GCN-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
7630; GCN-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
7631; GCN-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
7632; GCN-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
7633; GCN-NEXT:    s_waitcnt vmcnt(8)
7634; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
7635; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xfc, v0
7636; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7637; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
7638; GCN-NEXT:    s_waitcnt expcnt(0)
7639; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xf8, v0
7640; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7641; GCN-NEXT:    s_waitcnt expcnt(0)
7642; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v29
7643; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xf4, v0
7644; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7645; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
7646; GCN-NEXT:    s_waitcnt expcnt(0)
7647; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
7648; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7649; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xec, v0
7650; GCN-NEXT:    s_waitcnt expcnt(0)
7651; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
7652; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7653; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
7654; GCN-NEXT:    s_waitcnt expcnt(0)
7655; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
7656; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7657; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xe4, v0
7658; GCN-NEXT:    s_waitcnt expcnt(0)
7659; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
7660; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7661; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
7662; GCN-NEXT:    s_waitcnt expcnt(0)
7663; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xe0, v0
7664; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7665; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xdc, v0
7666; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xd8, v0
7667; GCN-NEXT:    s_waitcnt expcnt(0)
7668; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
7669; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7670; GCN-NEXT:    buffer_store_dword v2, v27, s[0:3], 0 offen
7671; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xd4, v0
7672; GCN-NEXT:    buffer_store_dword v1, v28, s[0:3], 0 offen
7673; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xd0, v0
7674; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xcc, v0
7675; GCN-NEXT:    s_waitcnt expcnt(0)
7676; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v25
7677; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7678; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
7679; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0xc8, v0
7680; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
7681; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xc4, v0
7682; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xc0, v0
7683; GCN-NEXT:    s_waitcnt expcnt(0)
7684; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v24
7685; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7686; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
7687; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0xbc, v0
7688; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
7689; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0xb8, v0
7690; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xb4, v0
7691; GCN-NEXT:    s_waitcnt expcnt(0)
7692; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v23
7693; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7694; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
7695; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0xb0, v0
7696; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
7697; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xac, v0
7698; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xa8, v0
7699; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
7700; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
7701; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7702; GCN-NEXT:    buffer_store_dword v2, v24, s[0:3], 0 offen
7703; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0xa4, v0
7704; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
7705; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0xa0, v0
7706; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x9c, v0
7707; GCN-NEXT:    s_waitcnt expcnt(0)
7708; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v33
7709; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7710; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
7711; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x98, v0
7712; GCN-NEXT:    buffer_store_dword v1, v23, s[0:3], 0 offen
7713; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0x94, v0
7714; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x90, v0
7715; GCN-NEXT:    s_waitcnt expcnt(0)
7716; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
7717; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7718; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
7719; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x8c, v0
7720; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
7721; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x88, v0
7722; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0x84, v0
7723; GCN-NEXT:    s_waitcnt expcnt(0)
7724; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v31
7725; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7726; GCN-NEXT:    buffer_store_dword v2, v24, s[0:3], 0 offen
7727; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0x80, v0
7728; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
7729; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x7c, v0
7730; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x78, v0
7731; GCN-NEXT:    s_waitcnt expcnt(0)
7732; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v22
7733; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7734; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
7735; GCN-NEXT:    v_add_i32_e32 v22, vcc, 0x74, v0
7736; GCN-NEXT:    buffer_store_dword v1, v28, s[0:3], 0 offen
7737; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x70, v0
7738; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x6c, v0
7739; GCN-NEXT:    s_waitcnt expcnt(0)
7740; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v21
7741; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7742; GCN-NEXT:    buffer_store_dword v2, v23, s[0:3], 0 offen
7743; GCN-NEXT:    v_add_i32_e32 v21, vcc, 0x68, v0
7744; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
7745; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0x64, v0
7746; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x60, v0
7747; GCN-NEXT:    s_waitcnt expcnt(0)
7748; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v20
7749; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7750; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
7751; GCN-NEXT:    v_add_i32_e32 v20, vcc, 0x5c, v0
7752; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
7753; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x58, v0
7754; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x54, v0
7755; GCN-NEXT:    s_waitcnt expcnt(0)
7756; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
7757; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7758; GCN-NEXT:    buffer_store_dword v2, v32, s[0:3], 0 offen
7759; GCN-NEXT:    v_add_i32_e32 v19, vcc, 0x50, v0
7760; GCN-NEXT:    buffer_store_dword v1, v24, s[0:3], 0 offen
7761; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0x4c, v0
7762; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0x48, v0
7763; GCN-NEXT:    s_waitcnt expcnt(0)
7764; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v18
7765; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7766; GCN-NEXT:    buffer_store_dword v2, v25, s[0:3], 0 offen
7767; GCN-NEXT:    v_add_i32_e32 v18, vcc, 0x44, v0
7768; GCN-NEXT:    buffer_store_dword v1, v31, s[0:3], 0 offen
7769; GCN-NEXT:    v_add_i32_e32 v25, vcc, 64, v0
7770; GCN-NEXT:    v_add_i32_e32 v31, vcc, 60, v0
7771; GCN-NEXT:    s_waitcnt expcnt(0)
7772; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v17
7773; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7774; GCN-NEXT:    buffer_store_dword v2, v22, s[0:3], 0 offen
7775; GCN-NEXT:    v_add_i32_e32 v17, vcc, 56, v0
7776; GCN-NEXT:    buffer_store_dword v1, v28, s[0:3], 0 offen
7777; GCN-NEXT:    v_add_i32_e32 v22, vcc, 52, v0
7778; GCN-NEXT:    v_add_i32_e32 v28, vcc, 48, v0
7779; GCN-NEXT:    s_waitcnt expcnt(0)
7780; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
7781; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7782; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
7783; GCN-NEXT:    v_add_i32_e32 v29, vcc, 44, v0
7784; GCN-NEXT:    buffer_store_dword v1, v21, s[0:3], 0 offen
7785; GCN-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
7786; GCN-NEXT:    v_add_i32_e32 v33, vcc, 36, v0
7787; GCN-NEXT:    s_waitcnt expcnt(0)
7788; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v15
7789; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7790; GCN-NEXT:    buffer_store_dword v2, v23, s[0:3], 0 offen
7791; GCN-NEXT:    v_add_i32_e32 v23, vcc, 32, v0
7792; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
7793; GCN-NEXT:    v_add_i32_e32 v30, vcc, 28, v0
7794; GCN-NEXT:    v_add_i32_e32 v34, vcc, 24, v0
7795; GCN-NEXT:    s_waitcnt expcnt(0)
7796; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
7797; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7798; GCN-NEXT:    buffer_store_dword v2, v20, s[0:3], 0 offen
7799; GCN-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
7800; GCN-NEXT:    buffer_store_dword v1, v26, s[0:3], 0 offen
7801; GCN-NEXT:    v_add_i32_e32 v26, vcc, 16, v0
7802; GCN-NEXT:    v_add_i32_e32 v35, vcc, 12, v0
7803; GCN-NEXT:    s_waitcnt expcnt(0)
7804; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v13
7805; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7806; GCN-NEXT:    buffer_store_dword v2, v27, s[0:3], 0 offen
7807; GCN-NEXT:    v_add_i32_e32 v27, vcc, 8, v0
7808; GCN-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
7809; GCN-NEXT:    v_add_i32_e32 v19, vcc, 4, v0
7810; GCN-NEXT:    s_waitcnt expcnt(0)
7811; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
7812; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
7813; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
7814; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
7815; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
7816; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
7817; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
7818; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
7819; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
7820; GCN-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
7821; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7822; GCN-NEXT:    v_cvt_f64_f32_e32 v[3:4], v11
7823; GCN-NEXT:    buffer_store_dword v2, v24, s[0:3], 0 offen
7824; GCN-NEXT:    v_cvt_f64_f32_e32 v[5:6], v10
7825; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
7826; GCN-NEXT:    s_waitcnt expcnt(0)
7827; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v9
7828; GCN-NEXT:    v_cvt_f64_f32_e32 v[7:8], v12
7829; GCN-NEXT:    v_cvt_f64_f32_e32 v[9:10], v36
7830; GCN-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
7831; GCN-NEXT:    v_cvt_f64_f32_e32 v[11:12], v13
7832; GCN-NEXT:    buffer_store_dword v3, v25, s[0:3], 0 offen
7833; GCN-NEXT:    s_waitcnt expcnt(0)
7834; GCN-NEXT:    v_cvt_f64_f32_e32 v[3:4], v14
7835; GCN-NEXT:    v_cvt_f64_f32_e32 v[13:14], v15
7836; GCN-NEXT:    v_cvt_f64_f32_e32 v[15:16], v16
7837; GCN-NEXT:    buffer_store_dword v6, v31, s[0:3], 0 offen
7838; GCN-NEXT:    buffer_store_dword v5, v17, s[0:3], 0 offen
7839; GCN-NEXT:    buffer_store_dword v2, v22, s[0:3], 0 offen
7840; GCN-NEXT:    buffer_store_dword v1, v28, s[0:3], 0 offen
7841; GCN-NEXT:    buffer_store_dword v10, v29, s[0:3], 0 offen
7842; GCN-NEXT:    buffer_store_dword v9, v21, s[0:3], 0 offen
7843; GCN-NEXT:    buffer_store_dword v16, v33, s[0:3], 0 offen
7844; GCN-NEXT:    buffer_store_dword v15, v23, s[0:3], 0 offen
7845; GCN-NEXT:    buffer_store_dword v14, v30, s[0:3], 0 offen
7846; GCN-NEXT:    buffer_store_dword v13, v34, s[0:3], 0 offen
7847; GCN-NEXT:    buffer_store_dword v4, v20, s[0:3], 0 offen
7848; GCN-NEXT:    buffer_store_dword v3, v26, s[0:3], 0 offen
7849; GCN-NEXT:    buffer_store_dword v12, v35, s[0:3], 0 offen
7850; GCN-NEXT:    buffer_store_dword v11, v27, s[0:3], 0 offen
7851; GCN-NEXT:    buffer_store_dword v8, v19, s[0:3], 0 offen
7852; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
7853; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7854; GCN-NEXT:    s_setpc_b64 s[30:31]
7855;
7856; GFX7-LABEL: global_extload_v32bf16_to_v32f64:
7857; GFX7:       ; %bb.0:
7858; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7859; GFX7-NEXT:    s_mov_b32 s6, 0
7860; GFX7-NEXT:    s_mov_b32 s7, 0xf000
7861; GFX7-NEXT:    s_mov_b32 s4, s6
7862; GFX7-NEXT:    s_mov_b32 s5, s6
7863; GFX7-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62
7864; GFX7-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60
7865; GFX7-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58
7866; GFX7-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56
7867; GFX7-NEXT:    buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54
7868; GFX7-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
7869; GFX7-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
7870; GFX7-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
7871; GFX7-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
7872; GFX7-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
7873; GFX7-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
7874; GFX7-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
7875; GFX7-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
7876; GFX7-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
7877; GFX7-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
7878; GFX7-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
7879; GFX7-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
7880; GFX7-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
7881; GFX7-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
7882; GFX7-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
7883; GFX7-NEXT:    buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8
7884; GFX7-NEXT:    buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
7885; GFX7-NEXT:    buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12
7886; GFX7-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
7887; GFX7-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16
7888; GFX7-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18
7889; GFX7-NEXT:    buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20
7890; GFX7-NEXT:    buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22
7891; GFX7-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
7892; GFX7-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
7893; GFX7-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
7894; GFX7-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
7895; GFX7-NEXT:    s_waitcnt vmcnt(14)
7896; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v17
7897; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7898; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xfc, v0
7899; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7900; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf8, v0
7901; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7902; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v18
7903; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7904; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xf4, v0
7905; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0xd8, v0
7906; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7907; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
7908; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7909; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
7910; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7911; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xec, v0
7912; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 0xd4, v0
7913; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7914; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
7915; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7916; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v20
7917; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7918; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xe4, v0
7919; GFX7-NEXT:    v_add_i32_e32 v20, vcc, 0xd0, v0
7920; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7921; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe0, v0
7922; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
7923; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7924; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v17
7925; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xdc, v0
7926; GFX7-NEXT:    s_waitcnt vmcnt(14)
7927; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
7928; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7929; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
7930; GFX7-NEXT:    buffer_store_dword v1, v18, s[0:3], 0 offen
7931; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v2
7932; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v23
7933; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
7934; GFX7-NEXT:    buffer_store_dword v2, v19, s[0:3], 0 offen
7935; GFX7-NEXT:    buffer_store_dword v1, v20, s[0:3], 0 offen
7936; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xcc, v0
7937; GFX7-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
7938; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v24
7939; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7940; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0xc8, v0
7941; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
7942; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xc4, v0
7943; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7944; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
7945; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
7946; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc0, v0
7947; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7948; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xbc, v0
7949; GFX7-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
7950; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
7951; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7952; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0xb8, v0
7953; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
7954; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xb4, v0
7955; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7956; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
7957; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
7958; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb0, v0
7959; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7960; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xac, v0
7961; GFX7-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
7962; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
7963; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7964; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0xa8, v0
7965; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
7966; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0xa4, v0
7967; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7968; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v27
7969; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
7970; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa0, v0
7971; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7972; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x9c, v0
7973; GFX7-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
7974; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
7975; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7976; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x98, v0
7977; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
7978; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0x94, v0
7979; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7980; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v25
7981; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
7982; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x90, v0
7983; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7984; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x8c, v0
7985; GFX7-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
7986; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x88, v0
7987; GFX7-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
7988; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
7989; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
7990; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0x84, v0
7991; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
7992; GFX7-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
7993; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x80, v0
7994; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
7995; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v16
7996; GFX7-NEXT:    s_waitcnt vmcnt(14)
7997; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
7998; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
7999; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x7c, v0
8000; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 0x74, v0
8001; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
8002; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0x78, v0
8003; GFX7-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
8004; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
8005; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
8006; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
8007; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
8008; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
8009; GFX7-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
8010; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x70, v0
8011; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
8012; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v13
8013; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
8014; GFX7-NEXT:    v_cvt_f64_f32_e32 v[13:14], v13
8015; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 0x6c, v0
8016; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
8017; GFX7-NEXT:    buffer_store_dword v14, v19, s[0:3], 0 offen
8018; GFX7-NEXT:    v_add_i32_e32 v14, vcc, 0x68, v0
8019; GFX7-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
8020; GFX7-NEXT:    v_cvt_f64_f32_e32 v[13:14], v11
8021; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
8022; GFX7-NEXT:    v_cvt_f64_f32_e32 v[11:12], v11
8023; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 0x64, v0
8024; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
8025; GFX7-NEXT:    buffer_store_dword v12, v19, s[0:3], 0 offen
8026; GFX7-NEXT:    v_add_i32_e32 v12, vcc, 0x60, v0
8027; GFX7-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
8028; GFX7-NEXT:    v_cvt_f64_f32_e32 v[11:12], v9
8029; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
8030; GFX7-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
8031; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 0x5c, v0
8032; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
8033; GFX7-NEXT:    buffer_store_dword v10, v19, s[0:3], 0 offen
8034; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x58, v0
8035; GFX7-NEXT:    v_cvt_f64_f32_e32 v[19:20], v7
8036; GFX7-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
8037; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
8038; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
8039; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
8040; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
8041; GFX7-NEXT:    buffer_store_dword v20, v7, s[0:3], 0 offen
8042; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
8043; GFX7-NEXT:    buffer_store_dword v19, v7, s[0:3], 0 offen
8044; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 0x4c, v0
8045; GFX7-NEXT:    buffer_store_dword v5, v19, s[0:3], 0 offen
8046; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x48, v0
8047; GFX7-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
8048; GFX7-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
8049; GFX7-NEXT:    v_cvt_f64_f32_e32 v[19:20], v10
8050; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x44, v0
8051; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
8052; GFX7-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
8053; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 64, v0
8054; GFX7-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
8055; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
8056; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
8057; GFX7-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
8058; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
8059; GFX7-NEXT:    buffer_store_dword v19, v3, s[0:3], 0 offen
8060; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
8061; GFX7-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
8062; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
8063; GFX7-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
8064; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
8065; GFX7-NEXT:    buffer_store_dword v9, v3, s[0:3], 0 offen
8066; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
8067; GFX7-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen
8068; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
8069; GFX7-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen
8070; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
8071; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
8072; GFX7-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
8073; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
8074; GFX7-NEXT:    v_cvt_f64_f32_e32 v[15:16], v15
8075; GFX7-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
8076; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
8077; GFX7-NEXT:    buffer_store_dword v13, v3, s[0:3], 0 offen
8078; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
8079; GFX7-NEXT:    buffer_store_dword v18, v3, s[0:3], 0 offen
8080; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
8081; GFX7-NEXT:    buffer_store_dword v17, v3, s[0:3], 0 offen
8082; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
8083; GFX7-NEXT:    buffer_store_dword v16, v3, s[0:3], 0 offen
8084; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
8085; GFX7-NEXT:    buffer_store_dword v15, v3, s[0:3], 0 offen
8086; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
8087; GFX7-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
8088; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
8089; GFX7-NEXT:    s_waitcnt vmcnt(0)
8090; GFX7-NEXT:    s_setpc_b64 s[30:31]
8091;
8092; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
8093; GFX8:       ; %bb.0:
8094; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8095; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 2, v1
8096; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
8097; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v1
8098; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
8099; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 6, v1
8100; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v2, vcc
8101; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 8, v1
8102; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v2, vcc
8103; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 10, v1
8104; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v2, vcc
8105; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 12, v1
8106; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v2, vcc
8107; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 14, v1
8108; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v2, vcc
8109; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 16, v1
8110; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v2, vcc
8111; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 18, v1
8112; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v2, vcc
8113; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 20, v1
8114; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, 0, v2, vcc
8115; GFX8-NEXT:    v_add_u32_e32 v23, vcc, 22, v1
8116; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, 0, v2, vcc
8117; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 24, v1
8118; GFX8-NEXT:    v_addc_u32_e32 v26, vcc, 0, v2, vcc
8119; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 26, v1
8120; GFX8-NEXT:    v_addc_u32_e32 v28, vcc, 0, v2, vcc
8121; GFX8-NEXT:    v_add_u32_e32 v29, vcc, 28, v1
8122; GFX8-NEXT:    v_addc_u32_e32 v30, vcc, 0, v2, vcc
8123; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 30, v1
8124; GFX8-NEXT:    v_addc_u32_e32 v32, vcc, 0, v2, vcc
8125; GFX8-NEXT:    v_add_u32_e32 v33, vcc, 34, v1
8126; GFX8-NEXT:    v_addc_u32_e32 v34, vcc, 0, v2, vcc
8127; GFX8-NEXT:    v_add_u32_e32 v35, vcc, 36, v1
8128; GFX8-NEXT:    v_addc_u32_e32 v36, vcc, 0, v2, vcc
8129; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
8130; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
8131; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
8132; GFX8-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
8133; GFX8-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
8134; GFX8-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
8135; GFX8-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
8136; GFX8-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
8137; GFX8-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
8138; GFX8-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
8139; GFX8-NEXT:    buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
8140; GFX8-NEXT:    v_add_u32_e32 v37, vcc, 38, v1
8141; GFX8-NEXT:    flat_load_ushort v44, v[1:2]
8142; GFX8-NEXT:    v_addc_u32_e32 v38, vcc, 0, v2, vcc
8143; GFX8-NEXT:    v_add_u32_e32 v48, vcc, 40, v1
8144; GFX8-NEXT:    v_addc_u32_e32 v49, vcc, 0, v2, vcc
8145; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 62, v1
8146; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
8147; GFX8-NEXT:    flat_load_ushort v45, v[50:51]
8148; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 60, v1
8149; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
8150; GFX8-NEXT:    flat_load_ushort v46, v[50:51]
8151; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 42, v1
8152; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
8153; GFX8-NEXT:    v_add_u32_e32 v52, vcc, 58, v1
8154; GFX8-NEXT:    v_addc_u32_e32 v53, vcc, 0, v2, vcc
8155; GFX8-NEXT:    flat_load_ushort v47, v[52:53]
8156; GFX8-NEXT:    v_add_u32_e32 v52, vcc, 44, v1
8157; GFX8-NEXT:    v_addc_u32_e32 v53, vcc, 0, v2, vcc
8158; GFX8-NEXT:    v_add_u32_e32 v54, vcc, 56, v1
8159; GFX8-NEXT:    v_addc_u32_e32 v55, vcc, 0, v2, vcc
8160; GFX8-NEXT:    flat_load_ushort v56, v[54:55]
8161; GFX8-NEXT:    v_add_u32_e32 v54, vcc, 46, v1
8162; GFX8-NEXT:    v_addc_u32_e32 v55, vcc, 0, v2, vcc
8163; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 54, v1
8164; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
8165; GFX8-NEXT:    flat_load_ushort v57, v[39:40]
8166; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 52, v1
8167; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
8168; GFX8-NEXT:    flat_load_ushort v58, v[39:40]
8169; GFX8-NEXT:    v_add_u32_e32 v40, vcc, 48, v1
8170; GFX8-NEXT:    v_addc_u32_e32 v41, vcc, 0, v2, vcc
8171; GFX8-NEXT:    v_add_u32_e32 v42, vcc, 50, v1
8172; GFX8-NEXT:    v_addc_u32_e32 v43, vcc, 0, v2, vcc
8173; GFX8-NEXT:    flat_load_ushort v42, v[42:43]
8174; GFX8-NEXT:    flat_load_ushort v34, v[33:34]
8175; GFX8-NEXT:    flat_load_ushort v36, v[35:36]
8176; GFX8-NEXT:    flat_load_ushort v38, v[37:38]
8177; GFX8-NEXT:    flat_load_ushort v39, v[48:49]
8178; GFX8-NEXT:    flat_load_ushort v48, v[50:51]
8179; GFX8-NEXT:    flat_load_ushort v51, v[52:53]
8180; GFX8-NEXT:    flat_load_ushort v52, v[54:55]
8181; GFX8-NEXT:    flat_load_ushort v53, v[40:41]
8182; GFX8-NEXT:    v_add_u32_e32 v49, vcc, 32, v1
8183; GFX8-NEXT:    v_addc_u32_e32 v50, vcc, 0, v2, vcc
8184; GFX8-NEXT:    flat_load_ushort v37, v[3:4]
8185; GFX8-NEXT:    flat_load_ushort v35, v[5:6]
8186; GFX8-NEXT:    flat_load_ushort v33, v[7:8]
8187; GFX8-NEXT:    flat_load_ushort v8, v[9:10]
8188; GFX8-NEXT:    flat_load_ushort v6, v[11:12]
8189; GFX8-NEXT:    flat_load_ushort v4, v[13:14]
8190; GFX8-NEXT:    flat_load_ushort v2, v[15:16]
8191; GFX8-NEXT:    flat_load_ushort v1, v[19:20]
8192; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 4, v0
8193; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0x7c, v0
8194; GFX8-NEXT:    s_waitcnt vmcnt(14)
8195; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v44
8196; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
8197; GFX8-NEXT:    flat_load_ushort v3, v[17:18]
8198; GFX8-NEXT:    flat_load_ushort v5, v[21:22]
8199; GFX8-NEXT:    flat_load_ushort v7, v[23:24]
8200; GFX8-NEXT:    flat_load_ushort v9, v[25:26]
8201; GFX8-NEXT:    flat_load_ushort v10, v[27:28]
8202; GFX8-NEXT:    flat_load_ushort v11, v[29:30]
8203; GFX8-NEXT:    flat_load_ushort v12, v[31:32]
8204; GFX8-NEXT:    flat_load_ushort v13, v[49:50]
8205; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0x84, v0
8206; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
8207; GFX8-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
8208; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xfc, v0
8209; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v45
8210; GFX8-NEXT:    v_cvt_f64_f32_e32 v[15:16], v15
8211; GFX8-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
8212; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v46
8213; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
8214; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xf8, v0
8215; GFX8-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
8216; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xf4, v0
8217; GFX8-NEXT:    buffer_store_dword v17, v14, s[0:3], 0 offen
8218; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v47
8219; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
8220; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xf0, v0
8221; GFX8-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
8222; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xec, v0
8223; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
8224; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xe8, v0
8225; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v56
8226; GFX8-NEXT:    v_cvt_f64_f32_e32 v[15:16], v15
8227; GFX8-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
8228; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xe4, v0
8229; GFX8-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
8230; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xe0, v0
8231; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v57
8232; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
8233; GFX8-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
8234; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xdc, v0
8235; GFX8-NEXT:    buffer_store_dword v17, v14, s[0:3], 0 offen
8236; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v58
8237; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
8238; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xd8, v0
8239; GFX8-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
8240; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xd4, v0
8241; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
8242; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v42
8243; GFX8-NEXT:    v_cvt_f64_f32_e32 v[15:16], v15
8244; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xd0, v0
8245; GFX8-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
8246; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xcc, v0
8247; GFX8-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
8248; GFX8-NEXT:    s_waitcnt vmcnt(14)
8249; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
8250; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
8251; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xc8, v0
8252; GFX8-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
8253; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xc4, v0
8254; GFX8-NEXT:    buffer_store_dword v17, v14, s[0:3], 0 offen
8255; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v52
8256; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
8257; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xc0, v0
8258; GFX8-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
8259; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xbc, v0
8260; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
8261; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v51
8262; GFX8-NEXT:    v_cvt_f64_f32_e32 v[15:16], v15
8263; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xb8, v0
8264; GFX8-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
8265; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xb4, v0
8266; GFX8-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
8267; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
8268; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
8269; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xb0, v0
8270; GFX8-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
8271; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xac, v0
8272; GFX8-NEXT:    buffer_store_dword v17, v14, s[0:3], 0 offen
8273; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
8274; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
8275; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xa8, v0
8276; GFX8-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
8277; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xa4, v0
8278; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
8279; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v38
8280; GFX8-NEXT:    v_cvt_f64_f32_e32 v[15:16], v15
8281; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xa0, v0
8282; GFX8-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
8283; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x9c, v0
8284; GFX8-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
8285; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v36
8286; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
8287; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x98, v0
8288; GFX8-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
8289; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x94, v0
8290; GFX8-NEXT:    buffer_store_dword v17, v14, s[0:3], 0 offen
8291; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x90, v0
8292; GFX8-NEXT:    buffer_store_dword v16, v14, s[0:3], 0 offen
8293; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
8294; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
8295; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0x8c, v0
8296; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v37
8297; GFX8-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
8298; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x88, v0
8299; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
8300; GFX8-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
8301; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v16
8302; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v13
8303; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v35
8304; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
8305; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
8306; GFX8-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
8307; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0x80, v0
8308; GFX8-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
8309; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v13
8310; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
8311; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v33
8312; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
8313; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
8314; GFX8-NEXT:    buffer_store_dword v13, v19, s[0:3], 0 offen
8315; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x78, v0
8316; GFX8-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
8317; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v18
8318; GFX8-NEXT:    v_cvt_f64_f32_e32 v[18:19], v11
8319; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0x74, v0
8320; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
8321; GFX8-NEXT:    buffer_store_dword v19, v11, s[0:3], 0 offen
8322; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0x70, v0
8323; GFX8-NEXT:    buffer_store_dword v18, v11, s[0:3], 0 offen
8324; GFX8-NEXT:    v_cvt_f64_f32_e32 v[18:19], v8
8325; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
8326; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v8
8327; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x6c, v0
8328; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8329; GFX8-NEXT:    buffer_store_dword v11, v8, s[0:3], 0 offen
8330; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x68, v0
8331; GFX8-NEXT:    buffer_store_dword v10, v8, s[0:3], 0 offen
8332; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v6
8333; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
8334; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
8335; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x64, v0
8336; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
8337; GFX8-NEXT:    buffer_store_dword v9, v6, s[0:3], 0 offen
8338; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x60, v0
8339; GFX8-NEXT:    buffer_store_dword v8, v6, s[0:3], 0 offen
8340; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
8341; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
8342; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
8343; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x5c, v0
8344; GFX8-NEXT:    buffer_store_dword v7, v4, s[0:3], 0 offen
8345; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x58, v0
8346; GFX8-NEXT:    buffer_store_dword v6, v4, s[0:3], 0 offen
8347; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
8348; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
8349; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
8350; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
8351; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x54, v0
8352; GFX8-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
8353; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
8354; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
8355; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v3
8356; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v4
8357; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x4c, v0
8358; GFX8-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
8359; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
8360; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
8361; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x44, v0
8362; GFX8-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
8363; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 64, v0
8364; GFX8-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
8365; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 60, v0
8366; GFX8-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
8367; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 56, v0
8368; GFX8-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
8369; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 52, v0
8370; GFX8-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
8371; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 48, v0
8372; GFX8-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
8373; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 44, v0
8374; GFX8-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
8375; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 40, v0
8376; GFX8-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
8377; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 36, v0
8378; GFX8-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
8379; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v0
8380; GFX8-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
8381; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 28, v0
8382; GFX8-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
8383; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 24, v0
8384; GFX8-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
8385; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 20, v0
8386; GFX8-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
8387; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 16, v0
8388; GFX8-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
8389; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 12, v0
8390; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
8391; GFX8-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
8392; GFX8-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
8393; GFX8-NEXT:    buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
8394; GFX8-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
8395; GFX8-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
8396; GFX8-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
8397; GFX8-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
8398; GFX8-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
8399; GFX8-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
8400; GFX8-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
8401; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
8402; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
8403; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
8404; GFX8-NEXT:    s_waitcnt vmcnt(0)
8405; GFX8-NEXT:    s_setpc_b64 s[30:31]
8406;
8407; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
8408; GFX9:       ; %bb.0:
8409; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8410; GFX9-NEXT:    global_load_ushort v8, v[1:2], off offset:62
8411; GFX9-NEXT:    global_load_ushort v10, v[1:2], off offset:60
8412; GFX9-NEXT:    global_load_ushort v11, v[1:2], off offset:58
8413; GFX9-NEXT:    global_load_ushort v12, v[1:2], off offset:56
8414; GFX9-NEXT:    global_load_ushort v13, v[1:2], off offset:54
8415; GFX9-NEXT:    global_load_ushort v14, v[1:2], off offset:52
8416; GFX9-NEXT:    global_load_ushort v15, v[1:2], off offset:50
8417; GFX9-NEXT:    global_load_ushort v16, v[1:2], off offset:48
8418; GFX9-NEXT:    global_load_ushort v17, v[1:2], off offset:46
8419; GFX9-NEXT:    global_load_ushort v18, v[1:2], off offset:44
8420; GFX9-NEXT:    global_load_ushort v19, v[1:2], off offset:42
8421; GFX9-NEXT:    global_load_ushort v20, v[1:2], off offset:40
8422; GFX9-NEXT:    global_load_ushort v21, v[1:2], off offset:38
8423; GFX9-NEXT:    global_load_ushort v22, v[1:2], off offset:36
8424; GFX9-NEXT:    global_load_ushort v23, v[1:2], off offset:34
8425; GFX9-NEXT:    global_load_ushort v24, v[1:2], off offset:32
8426; GFX9-NEXT:    global_load_ushort v25, v[1:2], off
8427; GFX9-NEXT:    global_load_ushort v26, v[1:2], off offset:2
8428; GFX9-NEXT:    global_load_ushort v27, v[1:2], off offset:30
8429; GFX9-NEXT:    global_load_ushort v3, v[1:2], off offset:16
8430; GFX9-NEXT:    global_load_ushort v4, v[1:2], off offset:18
8431; GFX9-NEXT:    global_load_ushort v5, v[1:2], off offset:20
8432; GFX9-NEXT:    global_load_ushort v6, v[1:2], off offset:22
8433; GFX9-NEXT:    global_load_ushort v28, v[1:2], off offset:24
8434; GFX9-NEXT:    global_load_ushort v29, v[1:2], off offset:26
8435; GFX9-NEXT:    global_load_ushort v30, v[1:2], off offset:28
8436; GFX9-NEXT:    global_load_ushort v31, v[1:2], off offset:4
8437; GFX9-NEXT:    global_load_ushort v32, v[1:2], off offset:6
8438; GFX9-NEXT:    global_load_ushort v33, v[1:2], off offset:8
8439; GFX9-NEXT:    global_load_ushort v34, v[1:2], off offset:10
8440; GFX9-NEXT:    global_load_ushort v7, v[1:2], off offset:12
8441; GFX9-NEXT:    s_nop 0
8442; GFX9-NEXT:    global_load_ushort v1, v[1:2], off offset:14
8443; GFX9-NEXT:    s_waitcnt vmcnt(31)
8444; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
8445; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
8446; GFX9-NEXT:    s_waitcnt vmcnt(30)
8447; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
8448; GFX9-NEXT:    s_waitcnt vmcnt(28)
8449; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
8450; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:252
8451; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:248
8452; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
8453; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
8454; GFX9-NEXT:    s_waitcnt vmcnt(29)
8455; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
8456; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:244
8457; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:240
8458; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
8459; GFX9-NEXT:    s_waitcnt vmcnt(30)
8460; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
8461; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:236
8462; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:232
8463; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v10
8464; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
8465; GFX9-NEXT:    s_waitcnt vmcnt(31)
8466; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v15
8467; GFX9-NEXT:    s_waitcnt vmcnt(30)
8468; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
8469; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:228
8470; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:224
8471; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
8472; GFX9-NEXT:    s_waitcnt vmcnt(31)
8473; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
8474; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v13
8475; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:220
8476; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:216
8477; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v14
8478; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
8479; GFX9-NEXT:    s_waitcnt vmcnt(32)
8480; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v18
8481; GFX9-NEXT:    s_waitcnt vmcnt(30)
8482; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v20
8483; GFX9-NEXT:    s_waitcnt vmcnt(28)
8484; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
8485; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:212
8486; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:208
8487; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
8488; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
8489; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v21
8490; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:204
8491; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:200
8492; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:196
8493; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:192
8494; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v20
8495; GFX9-NEXT:    s_waitcnt vmcnt(33)
8496; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
8497; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
8498; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v18
8499; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v19
8500; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:188
8501; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:184
8502; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:180
8503; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:176
8504; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:172
8505; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:168
8506; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:164
8507; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:160
8508; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:156
8509; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:152
8510; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:148
8511; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
8512; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:144
8513; GFX9-NEXT:    s_waitcnt vmcnt(44)
8514; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v24
8515; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:140
8516; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:136
8517; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v10
8518; GFX9-NEXT:    s_waitcnt vmcnt(43)
8519; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v27
8520; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:132
8521; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:128
8522; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
8523; GFX9-NEXT:    s_waitcnt vmcnt(38)
8524; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v30
8525; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:124
8526; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:120
8527; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v14
8528; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
8529; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:116
8530; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:112
8531; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v16
8532; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
8533; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
8534; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v26
8535; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v2
8536; GFX9-NEXT:    s_waitcnt vmcnt(41)
8537; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
8538; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v28
8539; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v2
8540; GFX9-NEXT:    s_waitcnt vmcnt(40)
8541; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
8542; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:108
8543; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:104
8544; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v18
8545; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v2
8546; GFX9-NEXT:    s_waitcnt vmcnt(41)
8547; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
8548; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v2
8549; GFX9-NEXT:    s_waitcnt vmcnt(40)
8550; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
8551; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
8552; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
8553; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
8554; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
8555; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
8556; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
8557; GFX9-NEXT:    v_cvt_f64_f32_e32 v[5:6], v2
8558; GFX9-NEXT:    s_waitcnt vmcnt(41)
8559; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
8560; GFX9-NEXT:    s_waitcnt vmcnt(40)
8561; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
8562; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
8563; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
8564; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
8565; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
8566; GFX9-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
8567; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
8568; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
8569; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
8570; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
8571; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
8572; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
8573; GFX9-NEXT:    v_cvt_f64_f32_e32 v[1:2], v22
8574; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
8575; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
8576; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
8577; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
8578; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
8579; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
8580; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:44
8581; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:40
8582; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:36
8583; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:32
8584; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:28
8585; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:24
8586; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:20
8587; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:16
8588; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:12
8589; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:8
8590; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:4
8591; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
8592; GFX9-NEXT:    s_waitcnt vmcnt(0)
8593; GFX9-NEXT:    s_setpc_b64 s[30:31]
8594;
8595; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
8596; GFX10:       ; %bb.0:
8597; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8598; GFX10-NEXT:    s_clause 0x1f
8599; GFX10-NEXT:    global_load_ushort v3, v[1:2], off
8600; GFX10-NEXT:    global_load_ushort v4, v[1:2], off offset:2
8601; GFX10-NEXT:    global_load_ushort v5, v[1:2], off offset:4
8602; GFX10-NEXT:    global_load_ushort v6, v[1:2], off offset:6
8603; GFX10-NEXT:    global_load_ushort v7, v[1:2], off offset:8
8604; GFX10-NEXT:    global_load_ushort v8, v[1:2], off offset:10
8605; GFX10-NEXT:    global_load_ushort v9, v[1:2], off offset:12
8606; GFX10-NEXT:    global_load_ushort v10, v[1:2], off offset:14
8607; GFX10-NEXT:    global_load_ushort v11, v[1:2], off offset:16
8608; GFX10-NEXT:    global_load_ushort v12, v[1:2], off offset:18
8609; GFX10-NEXT:    global_load_ushort v13, v[1:2], off offset:20
8610; GFX10-NEXT:    global_load_ushort v14, v[1:2], off offset:22
8611; GFX10-NEXT:    global_load_ushort v15, v[1:2], off offset:24
8612; GFX10-NEXT:    global_load_ushort v16, v[1:2], off offset:26
8613; GFX10-NEXT:    global_load_ushort v17, v[1:2], off offset:28
8614; GFX10-NEXT:    global_load_ushort v18, v[1:2], off offset:30
8615; GFX10-NEXT:    global_load_ushort v19, v[1:2], off offset:62
8616; GFX10-NEXT:    global_load_ushort v20, v[1:2], off offset:32
8617; GFX10-NEXT:    global_load_ushort v21, v[1:2], off offset:34
8618; GFX10-NEXT:    global_load_ushort v22, v[1:2], off offset:36
8619; GFX10-NEXT:    global_load_ushort v23, v[1:2], off offset:60
8620; GFX10-NEXT:    global_load_ushort v24, v[1:2], off offset:38
8621; GFX10-NEXT:    global_load_ushort v25, v[1:2], off offset:40
8622; GFX10-NEXT:    global_load_ushort v26, v[1:2], off offset:58
8623; GFX10-NEXT:    global_load_ushort v27, v[1:2], off offset:42
8624; GFX10-NEXT:    global_load_ushort v28, v[1:2], off offset:44
8625; GFX10-NEXT:    global_load_ushort v29, v[1:2], off offset:56
8626; GFX10-NEXT:    global_load_ushort v30, v[1:2], off offset:46
8627; GFX10-NEXT:    global_load_ushort v31, v[1:2], off offset:48
8628; GFX10-NEXT:    global_load_ushort v32, v[1:2], off offset:54
8629; GFX10-NEXT:    global_load_ushort v33, v[1:2], off offset:50
8630; GFX10-NEXT:    global_load_ushort v34, v[1:2], off offset:52
8631; GFX10-NEXT:    s_waitcnt vmcnt(31)
8632; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v3
8633; GFX10-NEXT:    s_waitcnt vmcnt(30)
8634; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v4
8635; GFX10-NEXT:    s_waitcnt vmcnt(29)
8636; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
8637; GFX10-NEXT:    s_waitcnt vmcnt(28)
8638; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v6
8639; GFX10-NEXT:    s_waitcnt vmcnt(27)
8640; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v7
8641; GFX10-NEXT:    s_waitcnt vmcnt(26)
8642; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v8
8643; GFX10-NEXT:    s_waitcnt vmcnt(25)
8644; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v9
8645; GFX10-NEXT:    s_waitcnt vmcnt(24)
8646; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
8647; GFX10-NEXT:    s_waitcnt vmcnt(23)
8648; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v11
8649; GFX10-NEXT:    s_waitcnt vmcnt(22)
8650; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v12
8651; GFX10-NEXT:    s_waitcnt vmcnt(21)
8652; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v13
8653; GFX10-NEXT:    s_waitcnt vmcnt(20)
8654; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
8655; GFX10-NEXT:    v_cvt_f64_f32_e32 v[9:10], v35
8656; GFX10-NEXT:    v_cvt_f64_f32_e32 v[13:14], v36
8657; GFX10-NEXT:    s_waitcnt vmcnt(17)
8658; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v17
8659; GFX10-NEXT:    s_waitcnt vmcnt(16)
8660; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v18
8661; GFX10-NEXT:    s_waitcnt vmcnt(15)
8662; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
8663; GFX10-NEXT:    s_waitcnt vmcnt(14)
8664; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v20
8665; GFX10-NEXT:    s_waitcnt vmcnt(13)
8666; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v21
8667; GFX10-NEXT:    s_waitcnt vmcnt(12)
8668; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v22
8669; GFX10-NEXT:    s_waitcnt vmcnt(11)
8670; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v23
8671; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
8672; GFX10-NEXT:    s_waitcnt vmcnt(9)
8673; GFX10-NEXT:    v_lshlrev_b32_e32 v71, 16, v25
8674; GFX10-NEXT:    s_waitcnt vmcnt(8)
8675; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v26
8676; GFX10-NEXT:    s_waitcnt vmcnt(7)
8677; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v27
8678; GFX10-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
8679; GFX10-NEXT:    s_waitcnt vmcnt(5)
8680; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v29
8681; GFX10-NEXT:    s_waitcnt vmcnt(4)
8682; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v30
8683; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
8684; GFX10-NEXT:    s_waitcnt vmcnt(2)
8685; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
8686; GFX10-NEXT:    s_waitcnt vmcnt(1)
8687; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v33
8688; GFX10-NEXT:    v_cvt_f64_f32_e32 v[7:8], v7
8689; GFX10-NEXT:    s_waitcnt vmcnt(0)
8690; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v34
8691; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v31
8692; GFX10-NEXT:    v_cvt_f64_f32_e32 v[11:12], v11
8693; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v28
8694; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v24
8695; GFX10-NEXT:    v_cvt_f64_f32_e32 v[19:20], v19
8696; GFX10-NEXT:    v_cvt_f64_f32_e32 v[31:32], v71
8697; GFX10-NEXT:    v_cvt_f64_f32_e32 v[35:36], v68
8698; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v16
8699; GFX10-NEXT:    v_cvt_f64_f32_e32 v[33:34], v70
8700; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v15
8701; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
8702; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
8703; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v23
8704; GFX10-NEXT:    v_cvt_f64_f32_e32 v[15:16], v37
8705; GFX10-NEXT:    v_cvt_f64_f32_e32 v[17:18], v38
8706; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
8707; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:240
8708; GFX10-NEXT:    v_cvt_f64_f32_e32 v[3:4], v25
8709; GFX10-NEXT:    v_cvt_f64_f32_e32 v[37:38], v66
8710; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
8711; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:232
8712; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v27
8713; GFX10-NEXT:    v_cvt_f64_f32_e32 v[23:24], v48
8714; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
8715; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:224
8716; GFX10-NEXT:    v_cvt_f64_f32_e32 v[7:8], v81
8717; GFX10-NEXT:    v_cvt_f64_f32_e32 v[25:26], v49
8718; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
8719; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
8720; GFX10-NEXT:    v_cvt_f64_f32_e32 v[11:12], v80
8721; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:212
8722; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:208
8723; GFX10-NEXT:    v_cvt_f64_f32_e32 v[19:20], v69
8724; GFX10-NEXT:    v_cvt_f64_f32_e32 v[48:49], v64
8725; GFX10-NEXT:    v_cvt_f64_f32_e32 v[27:28], v50
8726; GFX10-NEXT:    v_cvt_f64_f32_e32 v[29:30], v51
8727; GFX10-NEXT:    v_cvt_f64_f32_e32 v[50:51], v54
8728; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
8729; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
8730; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v67
8731; GFX10-NEXT:    v_cvt_f64_f32_e32 v[21:22], v39
8732; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:196
8733; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:192
8734; GFX10-NEXT:    v_cvt_f64_f32_e32 v[3:4], v65
8735; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
8736; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
8737; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v55
8738; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:180
8739; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:176
8740; GFX10-NEXT:    v_cvt_f64_f32_e32 v[7:8], v53
8741; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:172
8742; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:168
8743; GFX10-NEXT:    v_cvt_f64_f32_e32 v[11:12], v52
8744; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
8745; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:160
8746; GFX10-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:156
8747; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
8748; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
8749; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
8750; GFX10-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
8751; GFX10-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:136
8752; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:132
8753; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
8754; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
8755; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
8756; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:116
8757; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:112
8758; GFX10-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
8759; GFX10-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
8760; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
8761; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
8762; GFX10-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:92
8763; GFX10-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:88
8764; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:84
8765; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
8766; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:76
8767; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:72
8768; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:68
8769; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:64
8770; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:60
8771; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:56
8772; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:52
8773; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:48
8774; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
8775; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
8776; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
8777; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
8778; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
8779; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
8780; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
8781; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
8782; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
8783; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
8784; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:4
8785; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
8786; GFX10-NEXT:    s_setpc_b64 s[30:31]
8787;
8788; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
8789; GFX11:       ; %bb.0:
8790; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8791; GFX11-NEXT:    s_clause 0x1f
8792; GFX11-NEXT:    global_load_u16 v3, v[1:2], off offset:12
8793; GFX11-NEXT:    global_load_u16 v4, v[1:2], off offset:8
8794; GFX11-NEXT:    global_load_u16 v5, v[1:2], off offset:4
8795; GFX11-NEXT:    global_load_u16 v6, v[1:2], off offset:2
8796; GFX11-NEXT:    global_load_u16 v7, v[1:2], off
8797; GFX11-NEXT:    global_load_u16 v8, v[1:2], off offset:6
8798; GFX11-NEXT:    global_load_u16 v9, v[1:2], off offset:10
8799; GFX11-NEXT:    global_load_u16 v10, v[1:2], off offset:14
8800; GFX11-NEXT:    global_load_u16 v11, v[1:2], off offset:28
8801; GFX11-NEXT:    global_load_u16 v12, v[1:2], off offset:24
8802; GFX11-NEXT:    global_load_u16 v13, v[1:2], off offset:20
8803; GFX11-NEXT:    global_load_u16 v14, v[1:2], off offset:18
8804; GFX11-NEXT:    global_load_u16 v15, v[1:2], off offset:16
8805; GFX11-NEXT:    global_load_u16 v16, v[1:2], off offset:22
8806; GFX11-NEXT:    global_load_u16 v17, v[1:2], off offset:26
8807; GFX11-NEXT:    global_load_u16 v18, v[1:2], off offset:30
8808; GFX11-NEXT:    global_load_u16 v19, v[1:2], off offset:44
8809; GFX11-NEXT:    global_load_u16 v20, v[1:2], off offset:40
8810; GFX11-NEXT:    global_load_u16 v21, v[1:2], off offset:36
8811; GFX11-NEXT:    global_load_u16 v22, v[1:2], off offset:34
8812; GFX11-NEXT:    global_load_u16 v23, v[1:2], off offset:32
8813; GFX11-NEXT:    global_load_u16 v24, v[1:2], off offset:38
8814; GFX11-NEXT:    global_load_u16 v25, v[1:2], off offset:42
8815; GFX11-NEXT:    global_load_u16 v26, v[1:2], off offset:46
8816; GFX11-NEXT:    global_load_u16 v27, v[1:2], off offset:60
8817; GFX11-NEXT:    global_load_u16 v28, v[1:2], off offset:56
8818; GFX11-NEXT:    global_load_u16 v29, v[1:2], off offset:52
8819; GFX11-NEXT:    global_load_u16 v30, v[1:2], off offset:50
8820; GFX11-NEXT:    global_load_u16 v31, v[1:2], off offset:48
8821; GFX11-NEXT:    global_load_u16 v32, v[1:2], off offset:54
8822; GFX11-NEXT:    global_load_u16 v33, v[1:2], off offset:58
8823; GFX11-NEXT:    global_load_u16 v1, v[1:2], off offset:62
8824; GFX11-NEXT:    s_waitcnt vmcnt(31)
8825; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
8826; GFX11-NEXT:    s_waitcnt vmcnt(30)
8827; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
8828; GFX11-NEXT:    s_waitcnt vmcnt(29)
8829; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
8830; GFX11-NEXT:    s_waitcnt vmcnt(28)
8831; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
8832; GFX11-NEXT:    s_waitcnt vmcnt(27)
8833; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
8834; GFX11-NEXT:    s_waitcnt vmcnt(26)
8835; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
8836; GFX11-NEXT:    s_waitcnt vmcnt(25)
8837; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
8838; GFX11-NEXT:    s_waitcnt vmcnt(24)
8839; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
8840; GFX11-NEXT:    s_waitcnt vmcnt(23)
8841; GFX11-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
8842; GFX11-NEXT:    s_waitcnt vmcnt(22)
8843; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 16, v12
8844; GFX11-NEXT:    s_waitcnt vmcnt(21)
8845; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
8846; GFX11-NEXT:    s_waitcnt vmcnt(20)
8847; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
8848; GFX11-NEXT:    s_waitcnt vmcnt(19)
8849; GFX11-NEXT:    v_lshlrev_b32_e32 v100, 16, v15
8850; GFX11-NEXT:    s_waitcnt vmcnt(18)
8851; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v16
8852; GFX11-NEXT:    s_waitcnt vmcnt(17)
8853; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
8854; GFX11-NEXT:    s_waitcnt vmcnt(16)
8855; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
8856; GFX11-NEXT:    s_waitcnt vmcnt(15)
8857; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v19
8858; GFX11-NEXT:    s_waitcnt vmcnt(14)
8859; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v20
8860; GFX11-NEXT:    s_waitcnt vmcnt(13)
8861; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
8862; GFX11-NEXT:    s_waitcnt vmcnt(12)
8863; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
8864; GFX11-NEXT:    s_waitcnt vmcnt(11)
8865; GFX11-NEXT:    v_lshlrev_b32_e32 v103, 16, v23
8866; GFX11-NEXT:    s_waitcnt vmcnt(10)
8867; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v24
8868; GFX11-NEXT:    s_waitcnt vmcnt(9)
8869; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
8870; GFX11-NEXT:    s_waitcnt vmcnt(8)
8871; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
8872; GFX11-NEXT:    s_waitcnt vmcnt(7)
8873; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v27
8874; GFX11-NEXT:    s_waitcnt vmcnt(6)
8875; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
8876; GFX11-NEXT:    s_waitcnt vmcnt(5)
8877; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
8878; GFX11-NEXT:    s_waitcnt vmcnt(4)
8879; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
8880; GFX11-NEXT:    s_waitcnt vmcnt(3)
8881; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
8882; GFX11-NEXT:    s_waitcnt vmcnt(2)
8883; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v32
8884; GFX11-NEXT:    s_waitcnt vmcnt(1)
8885; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
8886; GFX11-NEXT:    s_waitcnt vmcnt(0)
8887; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
8888; GFX11-NEXT:    v_cvt_f64_f32_e32 v[96:97], v68
8889; GFX11-NEXT:    v_cvt_f64_f32_e32 v[84:85], v65
8890; GFX11-NEXT:    v_cvt_f64_f32_e32 v[82:83], v64
8891; GFX11-NEXT:    v_cvt_f64_f32_e32 v[86:87], v33
8892; GFX11-NEXT:    v_cvt_f64_f32_e32 v[98:99], v1
8893; GFX11-NEXT:    v_cvt_f64_f32_e32 v[80:81], v29
8894; GFX11-NEXT:    v_cvt_f64_f32_e32 v[70:71], v30
8895; GFX11-NEXT:    v_cvt_f64_f32_e32 v[68:69], v53
8896; GFX11-NEXT:    v_cvt_f64_f32_e32 v[66:67], v26
8897; GFX11-NEXT:    v_cvt_f64_f32_e32 v[64:65], v52
8898; GFX11-NEXT:    v_cvt_f64_f32_e32 v[54:55], v25
8899; GFX11-NEXT:    v_cvt_f64_f32_e32 v[52:53], v49
8900; GFX11-NEXT:    v_cvt_f64_f32_e32 v[50:51], v48
8901; GFX11-NEXT:    v_cvt_f64_f32_e32 v[48:49], v21
8902; GFX11-NEXT:    v_cvt_f64_f32_e32 v[23:24], v34
8903; GFX11-NEXT:    v_cvt_f64_f32_e32 v[35:36], v22
8904; GFX11-NEXT:    v_cvt_f64_f32_e32 v[33:34], v103
8905; GFX11-NEXT:    v_cvt_f64_f32_e32 v[31:32], v18
8906; GFX11-NEXT:    v_cvt_f64_f32_e32 v[29:30], v102
8907; GFX11-NEXT:    v_cvt_f64_f32_e32 v[27:28], v17
8908; GFX11-NEXT:    v_cvt_f64_f32_e32 v[25:26], v101
8909; GFX11-NEXT:    v_cvt_f64_f32_e32 v[21:22], v13
8910; GFX11-NEXT:    v_cvt_f64_f32_e32 v[19:20], v14
8911; GFX11-NEXT:    v_cvt_f64_f32_e32 v[17:18], v100
8912; GFX11-NEXT:    v_cvt_f64_f32_e32 v[15:16], v10
8913; GFX11-NEXT:    v_cvt_f64_f32_e32 v[13:14], v39
8914; GFX11-NEXT:    v_cvt_f64_f32_e32 v[11:12], v9
8915; GFX11-NEXT:    v_cvt_f64_f32_e32 v[9:10], v38
8916; GFX11-NEXT:    v_cvt_f64_f32_e32 v[7:8], v6
8917; GFX11-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
8918; GFX11-NEXT:    v_cvt_f64_f32_e32 v[3:4], v2
8919; GFX11-NEXT:    v_cvt_f64_f32_e32 v[1:2], v37
8920; GFX11-NEXT:    s_clause 0xf
8921; GFX11-NEXT:    scratch_store_b128 v0, v[96:99], off offset:240
8922; GFX11-NEXT:    scratch_store_b128 v0, v[84:87], off offset:224
8923; GFX11-NEXT:    scratch_store_b128 v0, v[80:83], off offset:208
8924; GFX11-NEXT:    scratch_store_b128 v0, v[68:71], off offset:192
8925; GFX11-NEXT:    scratch_store_b128 v0, v[64:67], off offset:176
8926; GFX11-NEXT:    scratch_store_b128 v0, v[52:55], off offset:160
8927; GFX11-NEXT:    scratch_store_b128 v0, v[48:51], off offset:144
8928; GFX11-NEXT:    scratch_store_b128 v0, v[33:36], off offset:128
8929; GFX11-NEXT:    scratch_store_b128 v0, v[29:32], off offset:112
8930; GFX11-NEXT:    scratch_store_b128 v0, v[25:28], off offset:96
8931; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:80
8932; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:64
8933; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
8934; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
8935; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
8936; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
8937; GFX11-NEXT:    s_setpc_b64 s[30:31]
8938  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
8939  %fpext = fpext <32 x bfloat> %load to <32 x double>
8940  ret <32 x double> %fpext
8941}
8942
; Scalar bfloat fadd. What the autogenerated checks below show, per run line
; ("GFX8 and newer" means the GFX8, GFX9, GFX10 and GFX11 runs)
;  * GCN and GFX7 runs - both inputs are multiplied by 1.0 and masked with
;    0xffff0000, added as f32, and the sum is masked with 0xffff0000 again
;    (no rounding sequence is emitted). This suggests the bf16 value lives in
;    the upper half of the 32-bit register on these runs - TODO confirm.
;  * GFX8 and newer - both inputs are shifted left by 16 and added as f32. The
;    sum is narrowed by adding 0x7fff plus bit 16 of the sum (this looks like
;    the usual round-to-nearest-even bias); v_cmp_u selects "sum | 0x400000"
;    instead when the sum compares unordered with itself. The selected value
;    is then shifted right by 16.
; The check lines are generated output; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
8943define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
8944; GCN-LABEL: v_fadd_bf16:
8945; GCN:       ; %bb.0:
8946; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8947; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8948; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8949; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
8950; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
8951; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
8952; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
8953; GCN-NEXT:    s_setpc_b64 s[30:31]
8954;
8955; GFX7-LABEL: v_fadd_bf16:
8956; GFX7:       ; %bb.0:
8957; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8958; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8959; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8960; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
8961; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
8962; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
8963; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
8964; GFX7-NEXT:    s_setpc_b64 s[30:31]
8965;
8966; GFX8-LABEL: v_fadd_bf16:
8967; GFX8:       ; %bb.0:
8968; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8969; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
8970; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
8971; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
8972; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
8973; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
8974; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
8975; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
8976; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
8977; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
8978; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
8979; GFX8-NEXT:    s_setpc_b64 s[30:31]
8980;
8981; GFX9-LABEL: v_fadd_bf16:
8982; GFX9:       ; %bb.0:
8983; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8984; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
8985; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
8986; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
8987; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
8988; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
8989; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
8990; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
8991; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
8992; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
8993; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
8994; GFX9-NEXT:    s_setpc_b64 s[30:31]
8995;
8996; GFX10-LABEL: v_fadd_bf16:
8997; GFX10:       ; %bb.0:
8998; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8999; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
9000; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9001; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
9002; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
9003; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
9004; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9005; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
9006; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
9007; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
9008; GFX10-NEXT:    s_setpc_b64 s[30:31]
9009;
9010; GFX11-LABEL: v_fadd_bf16:
9011; GFX11:       ; %bb.0:
9012; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9013; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
9014; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9015; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9016; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
9017; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
9018; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
9019; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9020; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9021; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
9022; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
9023; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9024; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
9025; GFX11-NEXT:    s_setpc_b64 s[30:31]
9026  %op = fadd bfloat %a, %b
9027  ret bfloat %op
9028}
9029
; Element-wise fadd of <2 x bfloat>. What the autogenerated checks below show,
; per run line ("GFX8 and newer" means the GFX8, GFX9, GFX10 and GFX11 runs)
;  * GCN and GFX7 runs - four input registers v0..v3 are each multiplied by 1.0
;    and masked with 0xffff0000, then added pairwise as f32 (v0 with v2, v1
;    with v3); both sums are masked with 0xffff0000 and left in v0 and v1.
;    Presumably each element occupies its own register here - TODO confirm.
;  * GFX8 and newer - only v0 and v1 are read. One f32 operand pair is formed by
;    shifting each register left by 16 and the other by masking each register
;    with 0xffff0000, i.e. two 16-bit elements per register. Each f32 sum is
;    narrowed by adding 0x7fff plus bit 16 of the sum, with v_cmp_u selecting
;    "sum | 0x400000" when the sum compares unordered with itself. The two
;    results are merged into v0 by v_alignbit_b32 on the GFX8 run and by
;    v_perm_b32 with selector 0x7060302 on the later runs.
; The check lines are generated output; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
9030define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
9031; GCN-LABEL: v_fadd_v2bf16:
9032; GCN:       ; %bb.0:
9033; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9034; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9035; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9036; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9037; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9038; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9039; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9040; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9041; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9042; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
9043; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
9044; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9045; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9046; GCN-NEXT:    s_setpc_b64 s[30:31]
9047;
9048; GFX7-LABEL: v_fadd_v2bf16:
9049; GFX7:       ; %bb.0:
9050; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9051; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9052; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9053; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9054; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9055; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9056; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9057; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9058; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9059; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
9060; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
9061; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9062; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9063; GFX7-NEXT:    s_setpc_b64 s[30:31]
9064;
9065; GFX8-LABEL: v_fadd_v2bf16:
9066; GFX8:       ; %bb.0:
9067; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9068; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9069; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
9070; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
9071; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
9072; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
9073; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9074; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9075; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
9076; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
9077; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
9078; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
9079; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
9080; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
9081; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
9082; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
9083; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
9084; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9085; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
9086; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
9087; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
9088; GFX8-NEXT:    s_setpc_b64 s[30:31]
9089;
9090; GFX9-LABEL: v_fadd_v2bf16:
9091; GFX9:       ; %bb.0:
9092; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9093; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9094; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
9095; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
9096; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9097; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9098; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
9099; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
9100; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
9101; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
9102; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
9103; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
9104; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
9105; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
9106; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
9107; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
9108; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9109; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
9110; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
9111; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
9112; GFX9-NEXT:    s_setpc_b64 s[30:31]
9113;
9114; GFX10-LABEL: v_fadd_v2bf16:
9115; GFX10:       ; %bb.0:
9116; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9117; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9118; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
9119; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9120; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9121; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
9122; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
9123; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
9124; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
9125; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
9126; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
9127; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
9128; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
9129; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
9130; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
9131; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9132; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
9133; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
9134; GFX10-NEXT:    s_setpc_b64 s[30:31]
9135;
9136; GFX11-LABEL: v_fadd_v2bf16:
9137; GFX11:       ; %bb.0:
9138; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9139; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9140; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9141; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
9142; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9143; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
9144; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
9145; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
9146; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9147; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
9148; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
9149; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
9150; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
9151; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
9152; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
9153; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
9154; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9155; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
9156; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9157; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
9158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9159; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
9160; GFX11-NEXT:    s_setpc_b64 s[30:31]
9161  %op = fadd <2 x bfloat> %a, %b
9162  ret <2 x bfloat> %op
9163}
9164
9165define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
9166; GCN-LABEL: v_fadd_v3bf16:
9167; GCN:       ; %bb.0:
9168; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9169; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9170; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9171; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9172; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9173; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9174; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9175; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9176; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9177; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9178; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9179; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9180; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9181; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
9182; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
9183; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
9184; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9185; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9186; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9187; GCN-NEXT:    s_setpc_b64 s[30:31]
9188;
9189; GFX7-LABEL: v_fadd_v3bf16:
9190; GFX7:       ; %bb.0:
9191; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9192; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9193; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9194; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9195; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9196; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9197; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9198; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9199; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9200; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9201; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9202; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9203; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9204; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
9205; GFX7-NEXT:    v_add_f32_e32 v1, v1, v4
9206; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
9207; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9208; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9209; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9210; GFX7-NEXT:    s_setpc_b64 s[30:31]
9211;
9212; GFX8-LABEL: v_fadd_v3bf16:
9213; GFX8:       ; %bb.0:
9214; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9215; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9216; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
9217; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
9218; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
9219; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
9220; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
9221; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
9222; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9223; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
9224; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
9225; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
9226; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
9227; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
9228; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
9229; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
9230; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9231; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9232; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
9233; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
9234; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
9235; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9236; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
9237; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
9238; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
9239; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
9240; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
9241; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9242; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
9243; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
9244; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
9245; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
9246; GFX8-NEXT:    s_setpc_b64 s[30:31]
9247;
9248; GFX9-LABEL: v_fadd_v3bf16:
9249; GFX9:       ; %bb.0:
9250; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9251; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9252; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
9253; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
9254; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
9255; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
9256; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
9257; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
9258; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9259; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
9260; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
9261; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
9262; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
9263; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9264; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9265; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
9266; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
9267; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
9268; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
9269; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9270; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
9271; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
9272; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
9273; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
9274; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9275; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
9276; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
9277; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
9278; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
9279; GFX9-NEXT:    s_setpc_b64 s[30:31]
9280;
9281; GFX10-LABEL: v_fadd_v3bf16:
9282; GFX10:       ; %bb.0:
9283; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9284; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
9285; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
9286; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9287; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9288; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9289; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
9290; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
9291; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
9292; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
9293; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
9294; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
9295; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
9296; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9297; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
9298; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
9299; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9300; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
9301; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
9302; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
9303; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9304; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9305; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9306; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9307; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
9308; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9309; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
9310; GFX10-NEXT:    s_setpc_b64 s[30:31]
9311;
9312; GFX11TRUE16-LABEL: v_fadd_v3bf16:
9313; GFX11TRUE16:       ; %bb.0:
9314; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9315; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
9316; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
9317; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9318; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9319; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9320; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
9321; GFX11TRUE16-NEXT:    v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
9322; GFX11TRUE16-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
9323; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9324; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
9325; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
9326; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9327; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
9328; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
9329; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
9330; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9331; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
9332; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
9333; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
9334; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9335; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9336; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9337; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9338; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9339; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
9340; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9341; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9342; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
9343; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
9344;
9345; GFX11FAKE16-LABEL: v_fadd_v3bf16:
9346; GFX11FAKE16:       ; %bb.0:
9347; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9348; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
9349; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
9350; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9351; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9352; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9353; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
9354; GFX11FAKE16-NEXT:    v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
9355; GFX11FAKE16-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
9356; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9357; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
9358; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
9359; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9360; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
9361; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
9362; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
9363; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9364; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
9365; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
9366; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
9367; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9368; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9369; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9370; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9371; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9372; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
9373; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9374; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9375; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
9376; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
9377  %op = fadd <3 x bfloat> %a, %b
9378  ret <3 x bfloat> %op
9379}
9380
; v_fadd_v4bf16: element-wise fadd of two <4 x bfloat> arguments, returning the
; <4 x bfloat> result.
;
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected output shows per prefix:
;   * GCN / GFX7: each operand element is handled in its own VGPR (v0-v7); the
;     inputs are multiplied by 1.0 and masked with 0xffff0000, added with
;     v_add_f32, and each result (v0-v3) is masked with 0xffff0000 again.
;   * GFX8 / GFX9 / GFX10 / GFX11: each of v0-v3 holds two elements; the halves
;     are widened with a 16-bit left shift or a 0xffff0000 mask, added with
;     v_add_f32, then narrowed with the v_bfe_u32 / add 0x7fff / v_or_b32
;     0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably
;     round-to-nearest-even with NaN quieting -- confirm against the lowering)
;     and repacked with v_alignbit_b32 (GFX8) or v_perm_b32 0x7060302 (GFX9+).
;   * Both gfx1100 RUN lines are covered by the shared GFX11 prefix here.
9381define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
9382; GCN-LABEL: v_fadd_v4bf16:
9383; GCN:       ; %bb.0:
9384; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9385; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9386; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9387; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9388; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9389; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9390; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
9391; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9392; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
9393; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9394; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9395; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9396; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9397; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9398; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9399; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9400; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9401; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
9402; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
9403; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
9404; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
9405; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9406; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9407; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9408; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9409; GCN-NEXT:    s_setpc_b64 s[30:31]
9410;
9411; GFX7-LABEL: v_fadd_v4bf16:
9412; GFX7:       ; %bb.0:
9413; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9414; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9415; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9416; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9417; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9418; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9419; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
9420; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9421; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
9422; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9423; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9424; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9425; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9426; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9427; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9428; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9429; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9430; GFX7-NEXT:    v_add_f32_e32 v3, v3, v7
9431; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
9432; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
9433; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
9434; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9435; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9436; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9437; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9438; GFX7-NEXT:    s_setpc_b64 s[30:31]
9439;
9440; GFX8-LABEL: v_fadd_v4bf16:
9441; GFX8:       ; %bb.0:
9442; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9443; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
9444; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
9445; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
9446; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
9447; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
9448; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9449; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9450; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
9451; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
9452; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
9453; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9454; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
9455; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
9456; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
9457; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
9458; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
9459; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
9460; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9461; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
9462; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
9463; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
9464; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
9465; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
9466; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
9467; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9468; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9469; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
9470; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
9471; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
9472; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9473; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
9474; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
9475; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
9476; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
9477; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
9478; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9479; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
9480; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
9481; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
9482; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
9483; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
9484; GFX8-NEXT:    s_setpc_b64 s[30:31]
9485;
9486; GFX9-LABEL: v_fadd_v4bf16:
9487; GFX9:       ; %bb.0:
9488; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9489; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
9490; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
9491; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
9492; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9493; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9494; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
9495; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
9496; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
9497; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
9498; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
9499; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9500; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
9501; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
9502; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
9503; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
9504; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9505; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
9506; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
9507; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
9508; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
9509; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9510; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9511; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
9512; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
9513; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
9514; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
9515; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9516; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
9517; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
9518; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
9519; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
9520; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9521; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
9522; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
9523; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
9524; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
9525; GFX9-NEXT:    s_setpc_b64 s[30:31]
9526;
9527; GFX10-LABEL: v_fadd_v4bf16:
9528; GFX10:       ; %bb.0:
9529; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9530; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
9531; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
9532; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9533; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9534; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
9535; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
9536; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
9537; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9538; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9539; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
9540; GFX10-NEXT:    v_add_f32_e32 v3, v7, v6
9541; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
9542; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
9543; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
9544; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9545; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
9546; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
9547; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
9548; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
9549; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9550; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
9551; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
9552; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
9553; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
9554; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
9555; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9556; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
9557; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
9558; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9559; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
9560; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9561; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
9562; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
9563; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
9564; GFX10-NEXT:    s_setpc_b64 s[30:31]
9565;
9566; GFX11-LABEL: v_fadd_v4bf16:
9567; GFX11:       ; %bb.0:
9568; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9569; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
9570; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
9571; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9572; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9573; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
9574; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
9575; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9576; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
9577; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9578; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
9579; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9580; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
9581; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
9582; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
9583; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
9584; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
9585; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
9586; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
9587; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9588; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
9589; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
9590; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
9591; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9592; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
9593; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
9594; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
9595; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
9596; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
9597; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9598; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
9599; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9600; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9601; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
9602; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9603; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
9604; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
9605; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9606; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
9607; GFX11-NEXT:    s_setpc_b64 s[30:31]
9608  %op = fadd <4 x bfloat> %a, %b
9609  ret <4 x bfloat> %op
9610}
9611
9612define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
9613; GCN-LABEL: v_fadd_v8bf16:
9614; GCN:       ; %bb.0:
9615; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9616; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9617; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
9618; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9619; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
9620; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9621; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
9622; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9623; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
9624; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9625; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
9626; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9627; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
9628; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
9629; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
9630; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
9631; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
9632; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
9633; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9634; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
9635; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9636; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
9637; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9638; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
9639; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9640; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
9641; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9642; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
9643; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9644; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
9645; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9646; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
9647; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9648; GCN-NEXT:    v_add_f32_e32 v7, v7, v15
9649; GCN-NEXT:    v_add_f32_e32 v6, v6, v14
9650; GCN-NEXT:    v_add_f32_e32 v5, v5, v13
9651; GCN-NEXT:    v_add_f32_e32 v4, v4, v12
9652; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
9653; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
9654; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
9655; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
9656; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9657; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9658; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9659; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9660; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9661; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9662; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9663; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9664; GCN-NEXT:    s_setpc_b64 s[30:31]
9665;
9666; GFX7-LABEL: v_fadd_v8bf16:
9667; GFX7:       ; %bb.0:
9668; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9669; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9670; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
9671; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9672; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
9673; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9674; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
9675; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9676; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
9677; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9678; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
9679; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9680; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
9681; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
9682; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
9683; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
9684; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
9685; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
9686; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9687; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
9688; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9689; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
9690; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9691; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
9692; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9693; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
9694; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9695; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
9696; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9697; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
9698; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9699; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
9700; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9701; GFX7-NEXT:    v_add_f32_e32 v7, v7, v15
9702; GFX7-NEXT:    v_add_f32_e32 v6, v6, v14
9703; GFX7-NEXT:    v_add_f32_e32 v5, v5, v13
9704; GFX7-NEXT:    v_add_f32_e32 v4, v4, v12
9705; GFX7-NEXT:    v_add_f32_e32 v3, v3, v11
9706; GFX7-NEXT:    v_add_f32_e32 v2, v2, v10
9707; GFX7-NEXT:    v_add_f32_e32 v1, v1, v9
9708; GFX7-NEXT:    v_add_f32_e32 v0, v0, v8
9709; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9710; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9711; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9712; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9713; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9714; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9715; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9716; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9717; GFX7-NEXT:    s_setpc_b64 s[30:31]
9718;
9719; GFX8-LABEL: v_fadd_v8bf16:
9720; GFX8:       ; %bb.0:
9721; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9722; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
9723; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
9724; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
9725; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
9726; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
9727; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9728; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9729; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
9730; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
9731; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
9732; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
9733; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
9734; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
9735; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
9736; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
9737; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
9738; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
9739; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9740; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
9741; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
9742; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
9743; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
9744; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
9745; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
9746; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9747; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9748; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
9749; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
9750; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
9751; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
9752; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
9753; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
9754; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
9755; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
9756; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
9757; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
9758; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
9759; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
9760; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
9761; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
9762; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
9763; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
9764; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9765; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9766; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
9767; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
9768; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
9769; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
9770; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
9771; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
9772; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
9773; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
9774; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9775; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9776; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
9777; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
9778; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
9779; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
9780; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
9781; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
9782; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9783; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9784; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
9785; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
9786; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
9787; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9788; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
9789; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
9790; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
9791; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
9792; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
9793; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9794; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
9795; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
9796; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
9797; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
9798; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
9799; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
9800; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
9801; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
9802; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
9803; GFX8-NEXT:    s_setpc_b64 s[30:31]
9804;
9805; GFX9-LABEL: v_fadd_v8bf16:
9806; GFX9:       ; %bb.0:
9807; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9808; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
9809; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
9810; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
9811; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9812; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9813; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
9814; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
9815; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
9816; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
9817; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
9818; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
9819; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
9820; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
9821; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
9822; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
9823; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9824; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
9825; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
9826; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
9827; GFX9-NEXT:    v_add_f32_e32 v7, v9, v7
9828; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9829; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9830; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
9831; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
9832; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
9833; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
9834; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
9835; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
9836; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
9837; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
9838; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
9839; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
9840; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
9841; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
9842; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
9843; GFX9-NEXT:    v_add_f32_e32 v6, v9, v6
9844; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9845; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9846; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
9847; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
9848; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
9849; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
9850; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
9851; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
9852; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
9853; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
9854; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9855; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9856; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
9857; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
9858; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
9859; GFX9-NEXT:    v_add_f32_e32 v5, v9, v5
9860; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9861; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9862; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
9863; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
9864; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
9865; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
9866; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9867; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
9868; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
9869; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
9870; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
9871; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
9872; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
9873; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
9874; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
9875; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
9876; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
9877; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
9878; GFX9-NEXT:    s_setpc_b64 s[30:31]
9879;
9880; GFX10-LABEL: v_fadd_v8bf16:
9881; GFX10:       ; %bb.0:
9882; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9883; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
9884; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
9885; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
9886; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9887; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
9888; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9889; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
9890; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
9891; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9892; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
9893; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
9894; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
9895; GFX10-NEXT:    v_add_f32_e32 v7, v10, v9
9896; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
9897; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
9898; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
9899; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
9900; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
9901; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
9902; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
9903; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
9904; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
9905; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
9906; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
9907; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
9908; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
9909; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
9910; GFX10-NEXT:    v_add_f32_e32 v6, v10, v6
9911; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
9912; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9913; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9914; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
9915; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
9916; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
9917; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
9918; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
9919; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9920; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
9921; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
9922; GFX10-NEXT:    v_add_f32_e32 v5, v15, v13
9923; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
9924; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
9925; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
9926; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
9927; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
9928; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
9929; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
9930; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
9931; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
9932; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
9933; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
9934; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
9935; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
9936; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
9937; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9938; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
9939; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
9940; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
9941; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
9942; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
9943; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
9944; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9945; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
9946; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
9947; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
9948; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
9949; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
9950; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
9951; GFX10-NEXT:    s_setpc_b64 s[30:31]
9952;
9953; GFX11-LABEL: v_fadd_v8bf16:
9954; GFX11:       ; %bb.0:
9955; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9956; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
9957; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
9958; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9959; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
9960; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
9961; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9962; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
9963; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
9964; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
9965; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
9966; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
9967; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9968; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
9969; GFX11-NEXT:    v_add_f32_e32 v7, v10, v9
9970; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
9971; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
9972; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9973; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
9974; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
9975; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
9976; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
9977; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
9978; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
9979; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
9980; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
9981; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
9982; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
9983; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
9984; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9985; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9986; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
9987; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
9988; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
9989; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
9990; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9991; GFX11-NEXT:    v_add_f32_e32 v6, v10, v6
9992; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
9993; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
9994; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
9995; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
9996; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
9997; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
9998; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
9999; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
10000; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
10001; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10002; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10003; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
10004; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10005; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
10006; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
10007; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
10008; GFX11-NEXT:    v_add_f32_e32 v5, v15, v13
10009; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10010; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
10011; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
10012; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
10013; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
10014; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10015; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
10016; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
10017; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10018; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
10019; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
10020; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
10021; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
10022; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
10023; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
10024; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
10025; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
10026; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10027; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
10028; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
10029; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10030; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
10031; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
10032; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10033; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
10034; GFX11-NEXT:    s_setpc_b64 s[30:31]
10035  %op = fadd <8 x bfloat> %a, %b
10036  ret <8 x bfloat> %op
10037}
10038
10039define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
10040; GCN-LABEL: v_fadd_v16bf16:
10041; GCN:       ; %bb.0:
10042; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10043; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
10044; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
10045; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
10046; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10047; GCN-NEXT:    v_add_f32_e32 v14, v14, v30
10048; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
10049; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
10050; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
10051; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10052; GCN-NEXT:    v_add_f32_e32 v13, v13, v29
10053; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
10054; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
10055; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
10056; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10057; GCN-NEXT:    v_add_f32_e32 v12, v12, v28
10058; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
10059; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
10060; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
10061; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
10062; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
10063; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
10064; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
10065; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
10066; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
10067; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
10068; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
10069; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
10070; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
10071; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
10072; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
10073; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
10074; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
10075; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
10076; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10077; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
10078; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10079; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
10080; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10081; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
10082; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
10083; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
10084; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10085; GCN-NEXT:    v_add_f32_e32 v11, v11, v27
10086; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32
10087; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
10088; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10089; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
10090; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10091; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
10092; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10093; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
10094; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10095; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
10096; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10097; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
10098; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10099; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
10100; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10101; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
10102; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10103; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
10104; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10105; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
10106; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10107; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
10108; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10109; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10110; GCN-NEXT:    v_add_f32_e32 v10, v10, v26
10111; GCN-NEXT:    v_add_f32_e32 v9, v9, v25
10112; GCN-NEXT:    v_add_f32_e32 v8, v8, v24
10113; GCN-NEXT:    v_add_f32_e32 v7, v7, v23
10114; GCN-NEXT:    v_add_f32_e32 v6, v6, v22
10115; GCN-NEXT:    v_add_f32_e32 v5, v5, v21
10116; GCN-NEXT:    v_add_f32_e32 v4, v4, v20
10117; GCN-NEXT:    v_add_f32_e32 v3, v3, v19
10118; GCN-NEXT:    v_add_f32_e32 v2, v2, v18
10119; GCN-NEXT:    v_add_f32_e32 v1, v1, v17
10120; GCN-NEXT:    v_add_f32_e32 v0, v0, v16
10121; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10122; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10123; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10124; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10125; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10126; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10127; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10128; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10129; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10130; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10131; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10132; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10133; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10134; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10135; GCN-NEXT:    s_waitcnt vmcnt(0)
10136; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v27
10137; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
10138; GCN-NEXT:    v_add_f32_e32 v15, v15, v16
10139; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10140; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10141; GCN-NEXT:    s_setpc_b64 s[30:31]
10142;
10143; GFX7-LABEL: v_fadd_v16bf16:
10144; GFX7:       ; %bb.0:
10145; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10146; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
10147; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
10148; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
10149; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10150; GFX7-NEXT:    v_add_f32_e32 v11, v11, v27
10151; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32
10152; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
10153; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
10154; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
10155; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10156; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
10157; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
10158; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
10159; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
10160; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
10161; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
10162; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
10163; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
10164; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
10165; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
10166; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
10167; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
10168; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
10169; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
10170; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
10171; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
10172; GFX7-NEXT:    v_add_f32_e32 v6, v6, v22
10173; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
10174; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
10175; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
10176; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
10177; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
10178; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10179; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
10180; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10181; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
10182; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10183; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
10184; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
10185; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10186; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
10187; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10188; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
10189; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10190; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
10191; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10192; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
10193; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10194; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
10195; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10196; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
10197; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10198; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10199; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
10200; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10201; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
10202; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10203; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
10204; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10205; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
10206; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10207; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
10208; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10209; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
10210; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10211; GFX7-NEXT:    v_add_f32_e32 v14, v14, v30
10212; GFX7-NEXT:    v_add_f32_e32 v13, v13, v29
10213; GFX7-NEXT:    v_add_f32_e32 v12, v12, v28
10214; GFX7-NEXT:    v_add_f32_e32 v10, v10, v26
10215; GFX7-NEXT:    v_add_f32_e32 v9, v9, v25
10216; GFX7-NEXT:    v_add_f32_e32 v8, v8, v24
10217; GFX7-NEXT:    v_add_f32_e32 v7, v7, v23
10218; GFX7-NEXT:    v_add_f32_e32 v5, v5, v21
10219; GFX7-NEXT:    v_add_f32_e32 v4, v4, v20
10220; GFX7-NEXT:    v_add_f32_e32 v3, v3, v19
10221; GFX7-NEXT:    v_add_f32_e32 v2, v2, v18
10222; GFX7-NEXT:    v_add_f32_e32 v1, v1, v17
10223; GFX7-NEXT:    v_add_f32_e32 v0, v0, v16
10224; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10225; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10226; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10227; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10228; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10229; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10230; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10231; GFX7-NEXT:    s_waitcnt vmcnt(0)
10232; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v27
10233; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
10234; GFX7-NEXT:    v_add_f32_e32 v15, v15, v22
10235; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10236; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10237; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10238; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10239; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10240; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10241; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10242; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10243; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10244; GFX7-NEXT:    s_setpc_b64 s[30:31]
10245;
10246; GFX8-LABEL: v_fadd_v16bf16:
10247; GFX8:       ; %bb.0:
10248; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10249; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
10250; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
10251; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
10252; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
10253; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
10254; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
10255; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10256; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10257; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10258; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
10259; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
10260; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
10261; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
10262; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
10263; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
10264; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
10265; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
10266; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
10267; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
10268; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
10269; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
10270; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
10271; GFX8-NEXT:    v_bfe_u32 v17, v15, 16, 1
10272; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v15
10273; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10274; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10275; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10276; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
10277; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
10278; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
10279; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
10280; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
10281; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
10282; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
10283; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
10284; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
10285; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
10286; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
10287; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
10288; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
10289; GFX8-NEXT:    v_bfe_u32 v17, v14, 16, 1
10290; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v14
10291; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10292; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10293; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10294; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
10295; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
10296; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
10297; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
10298; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
10299; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
10300; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
10301; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
10302; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10303; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
10304; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
10305; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
10306; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
10307; GFX8-NEXT:    v_bfe_u32 v17, v13, 16, 1
10308; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v13
10309; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10310; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10311; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10312; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
10313; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
10314; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
10315; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
10316; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
10317; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
10318; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
10319; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
10320; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
10321; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
10322; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
10323; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
10324; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
10325; GFX8-NEXT:    v_bfe_u32 v17, v12, 16, 1
10326; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v12
10327; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10328; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10329; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10330; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
10331; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
10332; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
10333; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
10334; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
10335; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
10336; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
10337; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
10338; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10339; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
10340; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
10341; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
10342; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
10343; GFX8-NEXT:    v_bfe_u32 v17, v11, 16, 1
10344; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v11
10345; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10346; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10347; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10348; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
10349; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
10350; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
10351; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
10352; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
10353; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
10354; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
10355; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
10356; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10357; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
10358; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
10359; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
10360; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
10361; GFX8-NEXT:    v_bfe_u32 v17, v10, 16, 1
10362; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v10
10363; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10364; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10365; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10366; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
10367; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
10368; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
10369; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
10370; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
10371; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
10372; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
10373; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
10374; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10375; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
10376; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
10377; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
10378; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
10379; GFX8-NEXT:    v_bfe_u32 v17, v9, 16, 1
10380; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v9
10381; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10382; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10383; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
10384; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
10385; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
10386; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
10387; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
10388; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
10389; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
10390; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
10391; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
10392; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
10393; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
10394; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
10395; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
10396; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
10397; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
10398; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10399; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10400; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10401; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
10402; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
10403; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
10404; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
10405; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
10406; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
10407; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
10408; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
10409; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
10410; GFX8-NEXT:    s_setpc_b64 s[30:31]
10411;
10412; GFX9-LABEL: v_fadd_v16bf16:
10413; GFX9:       ; %bb.0:
10414; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10415; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
10416; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
10417; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
10418; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10419; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10420; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
10421; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
10422; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
10423; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
10424; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
10425; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
10426; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
10427; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
10428; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
10429; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
10430; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
10431; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
10432; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
10433; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
10434; GFX9-NEXT:    v_add_f32_e32 v15, v17, v15
10435; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10436; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10437; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
10438; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
10439; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
10440; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
10441; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
10442; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
10443; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
10444; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
10445; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
10446; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
10447; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
10448; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
10449; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
10450; GFX9-NEXT:    v_add_f32_e32 v14, v17, v14
10451; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10452; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10453; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
10454; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
10455; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
10456; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
10457; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
10458; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
10459; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
10460; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
10461; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
10462; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10463; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
10464; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
10465; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
10466; GFX9-NEXT:    v_add_f32_e32 v13, v17, v13
10467; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10468; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10469; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
10470; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
10471; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
10472; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
10473; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
10474; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
10475; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
10476; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
10477; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
10478; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
10479; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
10480; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
10481; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
10482; GFX9-NEXT:    v_add_f32_e32 v12, v17, v12
10483; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10484; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10485; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
10486; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
10487; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
10488; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
10489; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
10490; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
10491; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
10492; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
10493; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
10494; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10495; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
10496; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
10497; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
10498; GFX9-NEXT:    v_add_f32_e32 v11, v17, v11
10499; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10500; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10501; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
10502; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
10503; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
10504; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
10505; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
10506; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
10507; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
10508; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
10509; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
10510; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10511; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
10512; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
10513; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
10514; GFX9-NEXT:    v_add_f32_e32 v10, v17, v10
10515; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10516; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10517; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
10518; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
10519; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
10520; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
10521; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
10522; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
10523; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
10524; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
10525; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
10526; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10527; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
10528; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
10529; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
10530; GFX9-NEXT:    v_add_f32_e32 v9, v17, v9
10531; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10532; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10533; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
10534; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
10535; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
10536; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
10537; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
10538; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
10539; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
10540; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
10541; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
10542; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
10543; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
10544; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
10545; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
10546; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
10547; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
10548; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
10549; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
10550; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
10551; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
10552; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
10553; GFX9-NEXT:    s_setpc_b64 s[30:31]
10554;
10555; GFX10-LABEL: v_fadd_v16bf16:
10556; GFX10:       ; %bb.0:
10557; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10558; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
10559; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
10560; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10561; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10562; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
10563; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10564; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
10565; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
10566; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
10567; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10568; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
10569; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
10570; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
10571; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
10572; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
10573; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
10574; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
10575; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
10576; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
10577; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
10578; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
10579; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
10580; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
10581; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
10582; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
10583; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10584; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
10585; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
10586; GFX10-NEXT:    v_bfe_u32 v18, v6, 16, 1
10587; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10588; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
10589; GFX10-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
10590; GFX10-NEXT:    v_add_f32_e32 v17, v20, v19
10591; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
10592; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
10593; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
10594; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
10595; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
10596; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
10597; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
10598; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
10599; GFX10-NEXT:    v_bfe_u32 v21, v5, 16, 1
10600; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10601; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10602; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
10603; GFX10-NEXT:    v_add_f32_e32 v13, v19, v18
10604; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
10605; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
10606; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
10607; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
10608; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
10609; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
10610; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
10611; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
10612; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10613; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
10614; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
10615; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
10616; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10617; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
10618; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
10619; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10620; GFX10-NEXT:    v_add_f32_e32 v12, v18, v12
10621; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
10622; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
10623; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
10624; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
10625; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
10626; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
10627; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
10628; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
10629; GFX10-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
10630; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
10631; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10632; GFX10-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
10633; GFX10-NEXT:    v_add_f32_e32 v18, v19, v18
10634; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10635; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
10636; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
10637; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
10638; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
10639; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
10640; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
10641; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10642; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
10643; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
10644; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
10645; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10646; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
10647; GFX10-NEXT:    v_bfe_u32 v19, v2, 16, 1
10648; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
10649; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
10650; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10651; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
10652; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
10653; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
10654; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
10655; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
10656; GFX10-NEXT:    v_add_f32_e32 v19, v22, v20
10657; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
10658; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
10659; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10660; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10661; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
10662; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
10663; GFX10-NEXT:    v_add_f32_e32 v9, v22, v20
10664; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
10665; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
10666; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
10667; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
10668; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
10669; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
10670; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
10671; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
10672; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
10673; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
10674; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
10675; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
10676; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
10677; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
10678; GFX10-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
10679; GFX10-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
10680; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
10681; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
10682; GFX10-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
10683; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
10684; GFX10-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
10685; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
10686; GFX10-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
10687; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10688; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
10689; GFX10-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
10690; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
10691; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
10692; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
10693; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
10694; GFX10-NEXT:    s_setpc_b64 s[30:31]
10695;
10696; GFX11-LABEL: v_fadd_v16bf16:
10697; GFX11:       ; %bb.0:
10698; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10699; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
10700; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
10701; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
10702; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
10703; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10704; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
10705; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
10706; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10707; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
10708; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10709; GFX11-NEXT:    v_add_f32_e32 v17, v18, v17
10710; GFX11-NEXT:    v_add_f32_e32 v6, v6, v14
10711; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
10712; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10713; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
10714; GFX11-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
10715; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
10716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10717; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
10718; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
10719; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
10720; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
10721; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10722; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
10723; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
10724; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
10725; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
10726; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
10727; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
10728; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
10729; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
10730; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
10731; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
10732; GFX11-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
10733; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
10734; GFX11-NEXT:    v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
10735; GFX11-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
10736; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
10737; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
10738; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10739; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
10740; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10741; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
10742; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
10743; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
10744; GFX11-NEXT:    v_add_f32_e32 v4, v4, v12
10745; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
10746; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
10747; GFX11-NEXT:    v_add_f32_e32 v5, v5, v13
10748; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
10749; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
10750; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
10751; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
10752; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
10753; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
10754; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
10755; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
10756; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
10757; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
10758; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
10759; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
10760; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10761; GFX11-NEXT:    v_add_f32_e32 v12, v18, v12
10762; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10763; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
10764; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
10765; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
10766; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
10767; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10768; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
10769; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
10770; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
10771; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
10772; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
10773; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
10774; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
10775; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
10776; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
10777; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
10778; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
10779; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
10780; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10781; GFX11-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
10782; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
10783; GFX11-NEXT:    v_add_f32_e32 v18, v19, v18
10784; GFX11-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
10785; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
10786; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
10787; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
10788; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
10789; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
10790; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10791; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
10792; GFX11-NEXT:    v_add_f32_e32 v3, v3, v11
10793; GFX11-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
10794; GFX11-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
10795; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
10796; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
10797; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10798; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
10799; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
10800; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10801; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
10802; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
10803; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
10804; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
10805; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
10806; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
10807; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
10808; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
10809; GFX11-NEXT:    v_add_f32_e32 v19, v22, v20
10810; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
10811; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
10812; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10813; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
10814; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
10815; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
10816; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
10817; GFX11-NEXT:    v_bfe_u32 v23, v19, 16, 1
10818; GFX11-NEXT:    v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9
10819; GFX11-NEXT:    v_add_f32_e32 v9, v22, v20
10820; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10821; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
10822; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
10823; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
10824; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
10825; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
10826; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
10827; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
10828; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
10829; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
10830; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
10831; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
10832; GFX11-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
10833; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10834; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
10835; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
10836; GFX11-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
10837; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10838; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
10839; GFX11-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
10840; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
10841; GFX11-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
10842; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10843; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10844; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
10845; GFX11-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
10846; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
10847; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
10848; GFX11-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
10849; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10850; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
10851; GFX11-NEXT:    s_setpc_b64 s[30:31]
10852  %op = fadd <16 x bfloat> %a, %b
10853  ret <16 x bfloat> %op
10854}
10855
10856define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
10857; GCN-LABEL: v_fadd_v32bf16:
10858; GCN:       ; %bb.0:
10859; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10860; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
10861; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
10862; GCN-NEXT:    s_waitcnt vmcnt(1)
10863; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
10864; GCN-NEXT:    s_waitcnt vmcnt(0)
10865; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
10866; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10867; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
10868; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
10869; GCN-NEXT:    v_add_f32_e32 v31, v31, v32
10870; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
10871; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
10872; GCN-NEXT:    s_waitcnt vmcnt(0)
10873; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10874; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10875; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
10876; GCN-NEXT:    v_add_f32_e32 v30, v30, v32
10877; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
10878; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
10879; GCN-NEXT:    s_waitcnt vmcnt(0)
10880; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10881; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10882; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
10883; GCN-NEXT:    v_add_f32_e32 v29, v29, v32
10884; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
10885; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
10886; GCN-NEXT:    s_waitcnt vmcnt(0)
10887; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10888; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10889; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
10890; GCN-NEXT:    v_add_f32_e32 v28, v28, v32
10891; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
10892; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
10893; GCN-NEXT:    s_waitcnt vmcnt(0)
10894; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10895; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10896; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
10897; GCN-NEXT:    v_add_f32_e32 v27, v27, v32
10898; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
10899; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
10900; GCN-NEXT:    s_waitcnt vmcnt(0)
10901; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10902; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10903; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
10904; GCN-NEXT:    v_add_f32_e32 v26, v26, v32
10905; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
10906; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
10907; GCN-NEXT:    s_waitcnt vmcnt(0)
10908; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10909; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10910; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
10911; GCN-NEXT:    v_add_f32_e32 v25, v25, v32
10912; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
10913; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
10914; GCN-NEXT:    s_waitcnt vmcnt(0)
10915; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10916; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10917; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
10918; GCN-NEXT:    v_add_f32_e32 v24, v24, v32
10919; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
10920; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
10921; GCN-NEXT:    s_waitcnt vmcnt(0)
10922; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10923; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10924; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
10925; GCN-NEXT:    v_add_f32_e32 v23, v23, v32
10926; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
10927; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
10928; GCN-NEXT:    s_waitcnt vmcnt(0)
10929; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10930; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10931; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
10932; GCN-NEXT:    v_add_f32_e32 v22, v22, v32
10933; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
10934; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
10935; GCN-NEXT:    s_waitcnt vmcnt(0)
10936; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10937; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10938; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
10939; GCN-NEXT:    v_add_f32_e32 v21, v21, v32
10940; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
10941; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
10942; GCN-NEXT:    s_waitcnt vmcnt(0)
10943; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10944; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10945; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
10946; GCN-NEXT:    v_add_f32_e32 v20, v20, v32
10947; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
10948; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
10949; GCN-NEXT:    s_waitcnt vmcnt(0)
10950; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10951; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10952; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
10953; GCN-NEXT:    v_add_f32_e32 v19, v19, v32
10954; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
10955; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
10956; GCN-NEXT:    s_waitcnt vmcnt(0)
10957; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10958; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10959; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
10960; GCN-NEXT:    v_add_f32_e32 v18, v18, v32
10961; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
10962; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
10963; GCN-NEXT:    s_waitcnt vmcnt(0)
10964; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10965; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10966; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
10967; GCN-NEXT:    v_add_f32_e32 v17, v17, v32
10968; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
10969; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
10970; GCN-NEXT:    s_waitcnt vmcnt(0)
10971; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10972; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10973; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
10974; GCN-NEXT:    v_add_f32_e32 v16, v16, v32
10975; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
10976; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
10977; GCN-NEXT:    s_waitcnt vmcnt(0)
10978; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10979; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10980; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
10981; GCN-NEXT:    v_add_f32_e32 v15, v15, v32
10982; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
10983; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
10984; GCN-NEXT:    s_waitcnt vmcnt(0)
10985; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10986; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10987; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
10988; GCN-NEXT:    v_add_f32_e32 v14, v14, v32
10989; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
10990; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
10991; GCN-NEXT:    s_waitcnt vmcnt(0)
10992; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
10993; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
10994; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
10995; GCN-NEXT:    v_add_f32_e32 v13, v13, v32
10996; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
10997; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
10998; GCN-NEXT:    s_waitcnt vmcnt(0)
10999; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11000; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11001; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
11002; GCN-NEXT:    v_add_f32_e32 v12, v12, v32
11003; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
11004; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
11005; GCN-NEXT:    s_waitcnt vmcnt(0)
11006; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11007; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11008; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
11009; GCN-NEXT:    v_add_f32_e32 v11, v11, v32
11010; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
11011; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
11012; GCN-NEXT:    s_waitcnt vmcnt(0)
11013; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11014; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11015; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
11016; GCN-NEXT:    v_add_f32_e32 v10, v10, v32
11017; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
11018; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
11019; GCN-NEXT:    s_waitcnt vmcnt(0)
11020; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11021; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11022; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
11023; GCN-NEXT:    v_add_f32_e32 v9, v9, v32
11024; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
11025; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
11026; GCN-NEXT:    s_waitcnt vmcnt(0)
11027; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11028; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11029; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
11030; GCN-NEXT:    v_add_f32_e32 v8, v8, v32
11031; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
11032; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
11033; GCN-NEXT:    s_waitcnt vmcnt(0)
11034; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11035; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11036; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
11037; GCN-NEXT:    v_add_f32_e32 v7, v7, v32
11038; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
11039; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
11040; GCN-NEXT:    s_waitcnt vmcnt(0)
11041; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11042; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11043; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
11044; GCN-NEXT:    v_add_f32_e32 v6, v6, v32
11045; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
11046; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
11047; GCN-NEXT:    s_waitcnt vmcnt(0)
11048; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11049; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11050; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
11051; GCN-NEXT:    v_add_f32_e32 v5, v5, v32
11052; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
11053; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
11054; GCN-NEXT:    s_waitcnt vmcnt(0)
11055; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11056; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11057; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
11058; GCN-NEXT:    v_add_f32_e32 v4, v4, v32
11059; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11060; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
11061; GCN-NEXT:    s_waitcnt vmcnt(0)
11062; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11063; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11064; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
11065; GCN-NEXT:    v_add_f32_e32 v3, v3, v32
11066; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11067; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11068; GCN-NEXT:    s_waitcnt vmcnt(0)
11069; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11070; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11071; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
11072; GCN-NEXT:    v_add_f32_e32 v2, v2, v32
11073; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
11074; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11075; GCN-NEXT:    s_waitcnt vmcnt(0)
11076; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11077; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11078; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
11079; GCN-NEXT:    v_add_f32_e32 v1, v1, v32
11080; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
11081; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11082; GCN-NEXT:    s_waitcnt vmcnt(0)
11083; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
11084; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11085; GCN-NEXT:    v_add_f32_e32 v0, v0, v32
11086; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11087; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11088; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11089; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
11090; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
11091; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
11092; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
11093; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
11094; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
11095; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
11096; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
11097; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
11098; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
11099; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
11100; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
11101; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
11102; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
11103; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
11104; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
11105; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
11106; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
11107; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
11108; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
11109; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
11110; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
11111; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
11112; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
11113; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
11114; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
11115; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
11116; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
11117; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
11118; GCN-NEXT:    s_setpc_b64 s[30:31]
11119;
11120; GFX7-LABEL: v_fadd_v32bf16:
11121; GFX7:       ; %bb.0:
11122; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11123; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
11124; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
11125; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
11126; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
11127; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
11128; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
11129; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
11130; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
11131; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
11132; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
11133; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
11134; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
11135; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
11136; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
11137; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
11138; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
11139; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
11140; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
11141; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
11142; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
11143; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
11144; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
11145; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
11146; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
11147; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
11148; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
11149; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
11150; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
11151; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
11152; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
11153; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
11154; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
11155; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
11156; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
11157; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
11158; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
11159; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
11160; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
11161; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
11162; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
11163; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
11164; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
11165; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
11166; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
11167; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
11168; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
11169; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
11170; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
11171; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
11172; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
11173; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
11174; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
11175; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
11176; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
11177; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
11178; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
11179; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11180; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
11181; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11182; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11183; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
11184; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11185; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
11186; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11187; GFX7-NEXT:    s_waitcnt vmcnt(1)
11188; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
11189; GFX7-NEXT:    s_waitcnt vmcnt(0)
11190; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11191; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11192; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
11193; GFX7-NEXT:    v_add_f32_e32 v31, v31, v32
11194; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
11195; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
11196; GFX7-NEXT:    s_waitcnt vmcnt(0)
11197; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11198; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11199; GFX7-NEXT:    v_add_f32_e32 v30, v30, v32
11200; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
11201; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
11202; GFX7-NEXT:    s_waitcnt vmcnt(0)
11203; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11204; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11205; GFX7-NEXT:    v_add_f32_e32 v29, v29, v32
11206; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
11207; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
11208; GFX7-NEXT:    s_waitcnt vmcnt(0)
11209; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11210; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11211; GFX7-NEXT:    v_add_f32_e32 v28, v28, v32
11212; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
11213; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
11214; GFX7-NEXT:    s_waitcnt vmcnt(0)
11215; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11216; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11217; GFX7-NEXT:    v_add_f32_e32 v27, v27, v32
11218; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
11219; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
11220; GFX7-NEXT:    s_waitcnt vmcnt(0)
11221; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11222; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11223; GFX7-NEXT:    v_add_f32_e32 v26, v26, v32
11224; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
11225; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
11226; GFX7-NEXT:    s_waitcnt vmcnt(0)
11227; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11228; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11229; GFX7-NEXT:    v_add_f32_e32 v25, v25, v32
11230; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
11231; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
11232; GFX7-NEXT:    s_waitcnt vmcnt(0)
11233; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11234; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11235; GFX7-NEXT:    v_add_f32_e32 v24, v24, v32
11236; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
11237; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
11238; GFX7-NEXT:    s_waitcnt vmcnt(0)
11239; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11240; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11241; GFX7-NEXT:    v_add_f32_e32 v23, v23, v32
11242; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
11243; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
11244; GFX7-NEXT:    s_waitcnt vmcnt(0)
11245; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11246; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11247; GFX7-NEXT:    v_add_f32_e32 v22, v22, v32
11248; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
11249; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
11250; GFX7-NEXT:    s_waitcnt vmcnt(0)
11251; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11252; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11253; GFX7-NEXT:    v_add_f32_e32 v21, v21, v32
11254; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
11255; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
11256; GFX7-NEXT:    s_waitcnt vmcnt(0)
11257; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11258; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11259; GFX7-NEXT:    v_add_f32_e32 v20, v20, v32
11260; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
11261; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
11262; GFX7-NEXT:    s_waitcnt vmcnt(0)
11263; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11264; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11265; GFX7-NEXT:    v_add_f32_e32 v19, v19, v32
11266; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
11267; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
11268; GFX7-NEXT:    s_waitcnt vmcnt(0)
11269; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11270; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11271; GFX7-NEXT:    v_add_f32_e32 v18, v18, v32
11272; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
11273; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
11274; GFX7-NEXT:    s_waitcnt vmcnt(0)
11275; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11276; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11277; GFX7-NEXT:    v_add_f32_e32 v17, v17, v32
11278; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
11279; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
11280; GFX7-NEXT:    s_waitcnt vmcnt(0)
11281; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11282; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11283; GFX7-NEXT:    v_add_f32_e32 v16, v16, v32
11284; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
11285; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
11286; GFX7-NEXT:    s_waitcnt vmcnt(0)
11287; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11288; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11289; GFX7-NEXT:    v_add_f32_e32 v15, v15, v32
11290; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
11291; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
11292; GFX7-NEXT:    s_waitcnt vmcnt(0)
11293; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11294; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11295; GFX7-NEXT:    v_add_f32_e32 v14, v14, v32
11296; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
11297; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
11298; GFX7-NEXT:    s_waitcnt vmcnt(0)
11299; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11300; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11301; GFX7-NEXT:    v_add_f32_e32 v13, v13, v32
11302; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
11303; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
11304; GFX7-NEXT:    s_waitcnt vmcnt(0)
11305; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11306; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11307; GFX7-NEXT:    v_add_f32_e32 v12, v12, v32
11308; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
11309; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
11310; GFX7-NEXT:    s_waitcnt vmcnt(0)
11311; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11312; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11313; GFX7-NEXT:    v_add_f32_e32 v11, v11, v32
11314; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
11315; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
11316; GFX7-NEXT:    s_waitcnt vmcnt(0)
11317; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11318; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11319; GFX7-NEXT:    v_add_f32_e32 v10, v10, v32
11320; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
11321; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
11322; GFX7-NEXT:    s_waitcnt vmcnt(0)
11323; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11324; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11325; GFX7-NEXT:    v_add_f32_e32 v9, v9, v32
11326; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
11327; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
11328; GFX7-NEXT:    s_waitcnt vmcnt(0)
11329; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11330; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11331; GFX7-NEXT:    v_add_f32_e32 v8, v8, v32
11332; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
11333; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
11334; GFX7-NEXT:    s_waitcnt vmcnt(0)
11335; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11336; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11337; GFX7-NEXT:    v_add_f32_e32 v7, v7, v32
11338; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
11339; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
11340; GFX7-NEXT:    s_waitcnt vmcnt(0)
11341; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11342; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11343; GFX7-NEXT:    v_add_f32_e32 v6, v6, v32
11344; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
11345; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
11346; GFX7-NEXT:    s_waitcnt vmcnt(0)
11347; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11348; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11349; GFX7-NEXT:    v_add_f32_e32 v5, v5, v32
11350; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
11351; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
11352; GFX7-NEXT:    s_waitcnt vmcnt(0)
11353; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11354; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11355; GFX7-NEXT:    v_add_f32_e32 v4, v4, v32
11356; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
11357; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
11358; GFX7-NEXT:    s_waitcnt vmcnt(0)
11359; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11360; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11361; GFX7-NEXT:    v_add_f32_e32 v3, v3, v32
11362; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
11363; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
11364; GFX7-NEXT:    s_waitcnt vmcnt(0)
11365; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11366; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11367; GFX7-NEXT:    v_add_f32_e32 v2, v2, v32
11368; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
11369; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11370; GFX7-NEXT:    s_waitcnt vmcnt(0)
11371; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11372; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11373; GFX7-NEXT:    v_add_f32_e32 v1, v1, v32
11374; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
11375; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11376; GFX7-NEXT:    s_waitcnt vmcnt(0)
11377; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
11378; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
11379; GFX7-NEXT:    v_add_f32_e32 v0, v0, v32
11380; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11381; GFX7-NEXT:    s_setpc_b64 s[30:31]
11382;
11383; GFX8-LABEL: v_fadd_v32bf16:
11384; GFX8:       ; %bb.0:
11385; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11386; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
11387; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
11388; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
11389; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
11390; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
11391; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
11392; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
11393; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
11394; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
11395; GFX8-NEXT:    v_add_f32_e32 v14, v14, v30
11396; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
11397; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
11398; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
11399; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
11400; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
11401; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
11402; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
11403; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
11404; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
11405; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
11406; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
11407; GFX8-NEXT:    v_add_f32_e32 v32, v32, v30
11408; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32
11409; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
11410; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
11411; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
11412; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
11413; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
11414; GFX8-NEXT:    v_bfe_u32 v29, v13, 16, 1
11415; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
11416; GFX8-NEXT:    v_alignbit_b32 v14, v14, v31, 16
11417; GFX8-NEXT:    s_waitcnt vmcnt(0)
11418; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
11419; GFX8-NEXT:    v_add_f32_e32 v33, v33, v34
11420; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
11421; GFX8-NEXT:    v_add_f32_e32 v30, v15, v30
11422; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
11423; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
11424; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
11425; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
11426; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
11427; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
11428; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
11429; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
11430; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11431; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
11432; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
11433; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
11434; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
11435; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
11436; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11437; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
11438; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
11439; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
11440; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
11441; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
11442; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
11443; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
11444; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
11445; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
11446; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
11447; GFX8-NEXT:    v_add_f32_e32 v29, v33, v29
11448; GFX8-NEXT:    v_bfe_u32 v33, v29, 16, 1
11449; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v29
11450; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
11451; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
11452; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11453; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
11454; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
11455; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
11456; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
11457; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
11458; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
11459; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
11460; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
11461; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
11462; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
11463; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
11464; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
11465; GFX8-NEXT:    v_add_f32_e32 v28, v33, v28
11466; GFX8-NEXT:    v_bfe_u32 v33, v28, 16, 1
11467; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v28
11468; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
11469; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
11470; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11471; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
11472; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
11473; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
11474; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
11475; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
11476; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
11477; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
11478; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
11479; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
11480; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
11481; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
11482; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
11483; GFX8-NEXT:    v_add_f32_e32 v27, v33, v27
11484; GFX8-NEXT:    v_bfe_u32 v33, v27, 16, 1
11485; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v27
11486; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
11487; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
11488; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11489; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
11490; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
11491; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
11492; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
11493; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
11494; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
11495; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
11496; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
11497; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
11498; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
11499; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
11500; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
11501; GFX8-NEXT:    v_add_f32_e32 v26, v33, v26
11502; GFX8-NEXT:    v_bfe_u32 v33, v26, 16, 1
11503; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v26
11504; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
11505; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
11506; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11507; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
11508; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
11509; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
11510; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
11511; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
11512; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
11513; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
11514; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
11515; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
11516; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
11517; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
11518; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
11519; GFX8-NEXT:    v_add_f32_e32 v25, v33, v25
11520; GFX8-NEXT:    v_bfe_u32 v33, v25, 16, 1
11521; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v25
11522; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
11523; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
11524; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11525; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
11526; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
11527; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
11528; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
11529; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
11530; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
11531; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
11532; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
11533; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
11534; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
11535; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
11536; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
11537; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
11538; GFX8-NEXT:    v_bfe_u32 v33, v24, 16, 1
11539; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v24
11540; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
11541; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
11542; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11543; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
11544; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
11545; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
11546; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
11547; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
11548; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
11549; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
11550; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
11551; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
11552; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
11553; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
11554; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
11555; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
11556; GFX8-NEXT:    v_bfe_u32 v33, v23, 16, 1
11557; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v23
11558; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
11559; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
11560; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11561; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
11562; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
11563; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
11564; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
11565; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
11566; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
11567; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
11568; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
11569; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
11570; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
11571; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
11572; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
11573; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
11574; GFX8-NEXT:    v_bfe_u32 v33, v22, 16, 1
11575; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v22
11576; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
11577; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
11578; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11579; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
11580; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
11581; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
11582; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
11583; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
11584; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
11585; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
11586; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
11587; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11588; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
11589; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
11590; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
11591; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
11592; GFX8-NEXT:    v_bfe_u32 v33, v21, 16, 1
11593; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v21
11594; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
11595; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
11596; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11597; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
11598; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
11599; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
11600; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
11601; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
11602; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
11603; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
11604; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
11605; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
11606; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
11607; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
11608; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
11609; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
11610; GFX8-NEXT:    v_bfe_u32 v33, v20, 16, 1
11611; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v20
11612; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
11613; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
11614; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11615; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
11616; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
11617; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
11618; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
11619; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
11620; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
11621; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
11622; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
11623; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
11624; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
11625; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
11626; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
11627; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
11628; GFX8-NEXT:    v_bfe_u32 v33, v19, 16, 1
11629; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v19
11630; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
11631; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11632; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11633; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
11634; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
11635; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
11636; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
11637; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
11638; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
11639; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
11640; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
11641; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11642; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
11643; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
11644; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
11645; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
11646; GFX8-NEXT:    v_bfe_u32 v33, v18, 16, 1
11647; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v18
11648; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
11649; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11650; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11651; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
11652; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
11653; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
11654; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
11655; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
11656; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
11657; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
11658; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
11659; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
11660; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
11661; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
11662; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
11663; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
11664; GFX8-NEXT:    v_bfe_u32 v33, v17, 16, 1
11665; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v17
11666; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
11667; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11668; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
11669; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
11670; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
11671; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
11672; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
11673; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
11674; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
11675; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
11676; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
11677; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
11678; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
11679; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
11680; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
11681; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11682; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11683; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
11684; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11685; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
11686; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
11687; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
11688; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
11689; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
11690; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
11691; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
11692; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
11693; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
11694; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
11695; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
11696; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
11697; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
11698; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
11699; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
11700; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
11701; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
11702; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
11703; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
11704; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
11705; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
11706; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
11707; GFX8-NEXT:    v_alignbit_b32 v13, v13, v32, 16
11708; GFX8-NEXT:    v_alignbit_b32 v15, v16, v15, 16
11709; GFX8-NEXT:    s_setpc_b64 s[30:31]
11710;
11711; GFX9-LABEL: v_fadd_v32bf16:
11712; GFX9:       ; %bb.0:
11713; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11714; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
11715; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
11716; GFX9-NEXT:    v_add_f32_e32 v31, v32, v31
11717; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
11718; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
11719; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
11720; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
11721; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
11722; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
11723; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
11724; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
11725; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
11726; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
11727; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
11728; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
11729; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
11730; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
11731; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
11732; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
11733; GFX9-NEXT:    v_add_f32_e32 v30, v32, v30
11734; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
11735; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
11736; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
11737; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
11738; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
11739; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
11740; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
11741; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
11742; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
11743; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
11744; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
11745; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
11746; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
11747; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
11748; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
11749; GFX9-NEXT:    v_add_f32_e32 v32, v32, v29
11750; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
11751; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
11752; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
11753; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
11754; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
11755; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
11756; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
11757; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
11758; GFX9-NEXT:    s_waitcnt vmcnt(0)
11759; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
11760; GFX9-NEXT:    v_add_f32_e32 v33, v33, v34
11761; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
11762; GFX9-NEXT:    v_add_f32_e32 v29, v15, v29
11763; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
11764; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
11765; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
11766; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
11767; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
11768; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
11769; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
11770; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
11771; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
11772; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
11773; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
11774; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
11775; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
11776; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
11777; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
11778; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
11779; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
11780; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
11781; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
11782; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
11783; GFX9-NEXT:    v_add_f32_e32 v28, v33, v28
11784; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
11785; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
11786; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
11787; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
11788; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
11789; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
11790; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
11791; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
11792; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
11793; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
11794; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
11795; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
11796; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
11797; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
11798; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
11799; GFX9-NEXT:    v_add_f32_e32 v27, v33, v27
11800; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
11801; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
11802; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
11803; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
11804; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
11805; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
11806; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
11807; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
11808; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
11809; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
11810; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
11811; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
11812; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
11813; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
11814; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
11815; GFX9-NEXT:    v_add_f32_e32 v26, v33, v26
11816; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
11817; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
11818; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
11819; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
11820; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
11821; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
11822; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
11823; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
11824; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
11825; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
11826; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
11827; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
11828; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
11829; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
11830; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
11831; GFX9-NEXT:    v_add_f32_e32 v25, v33, v25
11832; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
11833; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
11834; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
11835; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
11836; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
11837; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
11838; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
11839; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
11840; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
11841; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
11842; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
11843; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
11844; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
11845; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
11846; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
11847; GFX9-NEXT:    v_add_f32_e32 v24, v33, v24
11848; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
11849; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
11850; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
11851; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
11852; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
11853; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
11854; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
11855; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
11856; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
11857; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
11858; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
11859; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
11860; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
11861; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
11862; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
11863; GFX9-NEXT:    v_add_f32_e32 v23, v33, v23
11864; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
11865; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
11866; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
11867; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
11868; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
11869; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
11870; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
11871; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
11872; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
11873; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
11874; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
11875; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
11876; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
11877; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
11878; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
11879; GFX9-NEXT:    v_add_f32_e32 v22, v33, v22
11880; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
11881; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
11882; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
11883; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
11884; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
11885; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
11886; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
11887; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
11888; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
11889; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
11890; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
11891; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11892; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
11893; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
11894; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
11895; GFX9-NEXT:    v_add_f32_e32 v21, v33, v21
11896; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
11897; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
11898; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
11899; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
11900; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
11901; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
11902; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
11903; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
11904; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
11905; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
11906; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
11907; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
11908; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
11909; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
11910; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
11911; GFX9-NEXT:    v_add_f32_e32 v20, v33, v20
11912; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
11913; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
11914; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
11915; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
11916; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
11917; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
11918; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
11919; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
11920; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
11921; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
11922; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
11923; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
11924; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
11925; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
11926; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
11927; GFX9-NEXT:    v_add_f32_e32 v19, v33, v19
11928; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
11929; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11930; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
11931; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
11932; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
11933; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
11934; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
11935; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
11936; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
11937; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
11938; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
11939; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11940; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
11941; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
11942; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
11943; GFX9-NEXT:    v_add_f32_e32 v18, v33, v18
11944; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
11945; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11946; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
11947; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
11948; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
11949; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
11950; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
11951; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
11952; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
11953; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
11954; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
11955; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
11956; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
11957; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
11958; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
11959; GFX9-NEXT:    v_add_f32_e32 v17, v33, v17
11960; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
11961; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11962; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
11963; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
11964; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
11965; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
11966; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
11967; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
11968; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
11969; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
11970; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
11971; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
11972; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
11973; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
11974; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
11975; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
11976; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
11977; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
11978; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
11979; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
11980; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
11981; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
11982; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
11983; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
11984; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
11985; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
11986; GFX9-NEXT:    v_perm_b32 v12, v12, v32, s4
11987; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
11988; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
11989; GFX9-NEXT:    v_perm_b32 v15, v29, v15, s4
11990; GFX9-NEXT:    s_setpc_b64 s[30:31]
11991;
11992; GFX10-LABEL: v_fadd_v32bf16:
11993; GFX10:       ; %bb.0:
11994; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11995; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
11996; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
11997; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
11998; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
11999; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
12000; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
12001; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
12002; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
12003; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
12004; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
12005; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
12006; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
12007; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
12008; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
12009; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
12010; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
12011; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
12012; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
12013; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
12014; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
12015; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
12016; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
12017; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v6
12018; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
12019; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
12020; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
12021; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v21
12022; GFX10-NEXT:    v_add_f32_e32 v49, v50, v49
12023; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
12024; GFX10-NEXT:    v_add_f32_e32 v33, v34, v33
12025; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
12026; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
12027; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
12028; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
12029; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
12030; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
12031; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
12032; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
12033; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
12034; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v7
12035; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
12036; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
12037; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
12038; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
12039; GFX10-NEXT:    v_add_f32_e32 v27, v50, v27
12040; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v0
12041; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
12042; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12043; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
12044; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
12045; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
12046; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
12047; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
12048; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
12049; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
12050; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v18
12051; GFX10-NEXT:    v_add_f32_e32 v29, v38, v29
12052; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v2
12053; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
12054; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12055; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
12056; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
12057; GFX10-NEXT:    v_add_f32_e32 v28, v48, v28
12058; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
12059; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
12060; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12061; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
12062; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
12063; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
12064; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
12065; GFX10-NEXT:    v_add_f32_e32 v34, v34, v51
12066; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
12067; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
12068; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
12069; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
12070; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
12071; GFX10-NEXT:    v_add_f32_e32 v30, v36, v30
12072; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v3
12073; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
12074; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
12075; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
12076; GFX10-NEXT:    v_add_f32_e32 v18, v48, v23
12077; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
12078; GFX10-NEXT:    v_add_f32_e32 v17, v50, v22
12079; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v33
12080; GFX10-NEXT:    v_bfe_u32 v23, v14, 16, 1
12081; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
12082; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
12083; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
12084; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
12085; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
12086; GFX10-NEXT:    v_add_f32_e32 v20, v36, v25
12087; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
12088; GFX10-NEXT:    v_add_f32_e32 v19, v38, v24
12089; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v14
12090; GFX10-NEXT:    v_bfe_u32 v25, v35, 16, 1
12091; GFX10-NEXT:    v_add3_u32 v23, v23, v14, 0x7fff
12092; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v22, vcc_lo
12093; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
12094; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
12095; GFX10-NEXT:    v_add_f32_e32 v21, v51, v26
12096; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v35
12097; GFX10-NEXT:    v_bfe_u32 v36, v13, 16, 1
12098; GFX10-NEXT:    v_add3_u32 v25, v25, v35, 0x7fff
12099; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
12100; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
12101; GFX10-NEXT:    v_or_b32_e32 v38, 0x400000, v13
12102; GFX10-NEXT:    v_bfe_u32 v48, v37, 16, 1
12103; GFX10-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
12104; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v37
12105; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
12106; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
12107; GFX10-NEXT:    v_bfe_u32 v51, v12, 16, 1
12108; GFX10-NEXT:    v_add3_u32 v48, v48, v37, 0x7fff
12109; GFX10-NEXT:    v_or_b32_e32 v33, 0x400000, v12
12110; GFX10-NEXT:    v_bfe_u32 v22, v39, 16, 1
12111; GFX10-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc_lo
12112; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
12113; GFX10-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
12114; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v39
12115; GFX10-NEXT:    v_bfe_u32 v24, v11, 16, 1
12116; GFX10-NEXT:    v_add3_u32 v22, v22, v39, 0x7fff
12117; GFX10-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc_lo
12118; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
12119; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v11
12120; GFX10-NEXT:    v_bfe_u32 v26, v49, 16, 1
12121; GFX10-NEXT:    v_add3_u32 v24, v24, v11, 0x7fff
12122; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v49
12123; GFX10-NEXT:    v_cndmask_b32_e32 v33, v51, v33, vcc_lo
12124; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
12125; GFX10-NEXT:    v_bfe_u32 v38, v10, 16, 1
12126; GFX10-NEXT:    v_add3_u32 v26, v26, v49, 0x7fff
12127; GFX10-NEXT:    v_or_b32_e32 v37, 0x400000, v10
12128; GFX10-NEXT:    v_bfe_u32 v50, v34, 16, 1
12129; GFX10-NEXT:    v_cndmask_b32_e32 v14, v22, v14, vcc_lo
12130; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
12131; GFX10-NEXT:    v_add3_u32 v38, v38, v10, 0x7fff
12132; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v34
12133; GFX10-NEXT:    v_bfe_u32 v51, v9, 16, 1
12134; GFX10-NEXT:    v_add3_u32 v50, v50, v34, 0x7fff
12135; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v35, vcc_lo
12136; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
12137; GFX10-NEXT:    v_or_b32_e32 v39, 0x400000, v9
12138; GFX10-NEXT:    v_bfe_u32 v22, v30, 16, 1
12139; GFX10-NEXT:    v_add3_u32 v51, v51, v9, 0x7fff
12140; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v30
12141; GFX10-NEXT:    v_cndmask_b32_e32 v13, v26, v13, vcc_lo
12142; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
12143; GFX10-NEXT:    v_bfe_u32 v35, v8, 16, 1
12144; GFX10-NEXT:    v_add3_u32 v22, v22, v30, 0x7fff
12145; GFX10-NEXT:    v_or_b32_e32 v49, 0x400000, v8
12146; GFX10-NEXT:    v_bfe_u32 v26, v29, 16, 1
12147; GFX10-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc_lo
12148; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
12149; GFX10-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
12150; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v29
12151; GFX10-NEXT:    v_bfe_u32 v38, v7, 16, 1
12152; GFX10-NEXT:    v_add3_u32 v26, v26, v29, 0x7fff
12153; GFX10-NEXT:    v_cndmask_b32_e32 v12, v50, v12, vcc_lo
12154; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
12155; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v7
12156; GFX10-NEXT:    v_bfe_u32 v50, v28, 16, 1
12157; GFX10-NEXT:    v_add3_u32 v38, v38, v7, 0x7fff
12158; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v28
12159; GFX10-NEXT:    v_cndmask_b32_e32 v39, v51, v39, vcc_lo
12160; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
12161; GFX10-NEXT:    v_bfe_u32 v51, v6, 16, 1
12162; GFX10-NEXT:    v_add3_u32 v50, v50, v28, 0x7fff
12163; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v6
12164; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
12165; GFX10-NEXT:    v_cndmask_b32_e32 v11, v22, v11, vcc_lo
12166; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
12167; GFX10-NEXT:    v_bfe_u32 v22, v27, 16, 1
12168; GFX10-NEXT:    v_add3_u32 v51, v51, v6, 0x7fff
12169; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v27
12170; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
12171; GFX10-NEXT:    v_cndmask_b32_e32 v35, v35, v49, vcc_lo
12172; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
12173; GFX10-NEXT:    v_bfe_u32 v49, v5, 16, 1
12174; GFX10-NEXT:    v_add3_u32 v22, v22, v27, 0x7fff
12175; GFX10-NEXT:    v_or_b32_e32 v29, 0x400000, v5
12176; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
12177; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
12178; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
12179; GFX10-NEXT:    v_add3_u32 v49, v49, v5, 0x7fff
12180; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v21
12181; GFX10-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
12182; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
12183; GFX10-NEXT:    v_bfe_u32 v38, v4, 16, 1
12184; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
12185; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v4
12186; GFX10-NEXT:    v_cndmask_b32_e32 v9, v50, v9, vcc_lo
12187; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
12188; GFX10-NEXT:    v_bfe_u32 v50, v20, 16, 1
12189; GFX10-NEXT:    v_add3_u32 v38, v38, v4, 0x7fff
12190; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v20
12191; GFX10-NEXT:    v_cndmask_b32_e32 v30, v51, v30, vcc_lo
12192; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
12193; GFX10-NEXT:    v_add3_u32 v50, v50, v20, 0x7fff
12194; GFX10-NEXT:    v_bfe_u32 v51, v3, 16, 1
12195; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v3
12196; GFX10-NEXT:    v_cndmask_b32_e32 v8, v22, v8, vcc_lo
12197; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
12198; GFX10-NEXT:    v_bfe_u32 v22, v19, 16, 1
12199; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v19
12200; GFX10-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
12201; GFX10-NEXT:    v_cndmask_b32_e32 v29, v49, v29, vcc_lo
12202; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
12203; GFX10-NEXT:    v_add3_u32 v22, v22, v19, 0x7fff
12204; GFX10-NEXT:    v_bfe_u32 v49, v2, 16, 1
12205; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v2
12206; GFX10-NEXT:    v_cndmask_b32_e32 v7, v26, v7, vcc_lo
12207; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
12208; GFX10-NEXT:    v_bfe_u32 v26, v18, 16, 1
12209; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v18
12210; GFX10-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
12211; GFX10-NEXT:    v_cndmask_b32_e32 v28, v38, v28, vcc_lo
12212; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
12213; GFX10-NEXT:    v_bfe_u32 v38, v1, 16, 1
12214; GFX10-NEXT:    v_add3_u32 v26, v26, v18, 0x7fff
12215; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v1
12216; GFX10-NEXT:    v_cndmask_b32_e32 v6, v50, v6, vcc_lo
12217; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
12218; GFX10-NEXT:    v_bfe_u32 v50, v17, 16, 1
12219; GFX10-NEXT:    v_add3_u32 v38, v38, v1, 0x7fff
12220; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v17
12221; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
12222; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
12223; GFX10-NEXT:    v_bfe_u32 v22, v0, 16, 1
12224; GFX10-NEXT:    v_add3_u32 v50, v50, v17, 0x7fff
12225; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v0
12226; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v4, vcc_lo
12227; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
12228; GFX10-NEXT:    v_add3_u32 v22, v22, v0, 0x7fff
12229; GFX10-NEXT:    v_cndmask_b32_e32 v1, v38, v20, vcc_lo
12230; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
12231; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
12232; GFX10-NEXT:    v_cndmask_b32_e32 v17, v50, v19, vcc_lo
12233; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12234; GFX10-NEXT:    v_perm_b32 v4, v28, v7, 0x7060302
12235; GFX10-NEXT:    v_perm_b32 v7, v34, v10, 0x7060302
12236; GFX10-NEXT:    v_cndmask_b32_e32 v0, v22, v18, vcc_lo
12237; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12238; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
12239; GFX10-NEXT:    v_cndmask_b32_e32 v2, v49, v21, vcc_lo
12240; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
12241; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
12242; GFX10-NEXT:    v_cndmask_b32_e32 v3, v51, v27, vcc_lo
12243; GFX10-NEXT:    v_perm_b32 v5, v29, v8, 0x7060302
12244; GFX10-NEXT:    v_perm_b32 v8, v35, v11, 0x7060302
12245; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
12246; GFX10-NEXT:    v_perm_b32 v6, v30, v9, 0x7060302
12247; GFX10-NEXT:    v_perm_b32 v9, v39, v12, 0x7060302
12248; GFX10-NEXT:    s_waitcnt vmcnt(0)
12249; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
12250; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
12251; GFX10-NEXT:    v_add_f32_e32 v17, v31, v17
12252; GFX10-NEXT:    v_add_f32_e32 v15, v15, v18
12253; GFX10-NEXT:    v_bfe_u32 v10, v17, 16, 1
12254; GFX10-NEXT:    v_bfe_u32 v11, v15, 16, 1
12255; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v17
12256; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
12257; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v15
12258; GFX10-NEXT:    v_add3_u32 v18, v10, v17, 0x7fff
12259; GFX10-NEXT:    v_add3_u32 v11, v11, v15, 0x7fff
12260; GFX10-NEXT:    v_perm_b32 v10, v37, v13, 0x7060302
12261; GFX10-NEXT:    v_perm_b32 v13, v36, v25, 0x7060302
12262; GFX10-NEXT:    v_cndmask_b32_e32 v17, v18, v12, vcc_lo
12263; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
12264; GFX10-NEXT:    v_perm_b32 v12, v33, v48, 0x7060302
12265; GFX10-NEXT:    v_cndmask_b32_e32 v15, v11, v19, vcc_lo
12266; GFX10-NEXT:    v_perm_b32 v11, v24, v14, 0x7060302
12267; GFX10-NEXT:    v_perm_b32 v14, v23, v16, 0x7060302
12268; GFX10-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
12269; GFX10-NEXT:    s_setpc_b64 s[30:31]
12270;
12271; GFX11-LABEL: v_fadd_v32bf16:
12272; GFX11:       ; %bb.0:
12273; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12274; GFX11-NEXT:    scratch_load_b32 v32, off, s32
12275; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
12276; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
12277; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
12278; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
12279; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
12280; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
12281; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
12282; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12283; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
12284; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
12285; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
12286; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12287; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
12288; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
12289; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
12290; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
12291; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
12292; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
12293; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
12294; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
12295; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
12296; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
12297; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
12298; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
12299; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
12300; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
12301; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12302; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
12303; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
12304; GFX11-NEXT:    v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
12305; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
12306; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
12307; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
12308; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
12309; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
12310; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
12311; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
12312; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
12313; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
12314; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
12315; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
12316; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
12317; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12318; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
12319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
12320; GFX11-NEXT:    v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
12321; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
12322; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
12323; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
12324; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
12325; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
12326; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
12327; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
12328; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
12329; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
12330; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
12331; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
12332; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
12333; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
12334; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
12335; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
12336; GFX11-NEXT:    v_add_f32_e32 v0, v0, v16
12337; GFX11-NEXT:    v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
12338; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
12339; GFX11-NEXT:    v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
12340; GFX11-NEXT:    v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
12341; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
12342; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
12343; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
12344; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
12345; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
12346; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
12347; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
12348; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
12349; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
12350; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
12351; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
12352; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
12353; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
12354; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
12355; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
12356; GFX11-NEXT:    v_add_f32_e32 v4, v4, v20
12357; GFX11-NEXT:    v_add_f32_e32 v20, v80, v71
12358; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
12359; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
12360; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
12361; GFX11-NEXT:    v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
12362; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12363; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
12364; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
12365; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
12366; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
12367; GFX11-NEXT:    v_add_f32_e32 v26, v52, v51
12368; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
12369; GFX11-NEXT:    v_add_f32_e32 v6, v6, v22
12370; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
12371; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
12372; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
12373; GFX11-NEXT:    v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
12374; GFX11-NEXT:    v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
12375; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
12376; GFX11-NEXT:    v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
12377; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
12378; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12379; GFX11-NEXT:    v_add_f32_e32 v29, v38, v37
12380; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
12381; GFX11-NEXT:    v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
12382; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
12383; GFX11-NEXT:    v_add_f32_e32 v14, v14, v30
12384; GFX11-NEXT:    v_add_f32_e32 v28, v48, v39
12385; GFX11-NEXT:    v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
12386; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
12387; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
12388; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
12389; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
12390; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
12391; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
12392; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
12393; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
12394; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
12395; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
12396; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
12397; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
12398; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
12399; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
12400; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
12401; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
12402; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
12403; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
12404; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
12405; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
12406; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
12407; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
12408; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
12409; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
12410; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
12411; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
12412; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
12413; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
12414; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
12415; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
12416; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
12417; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
12418; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
12419; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
12420; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
12421; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
12422; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
12423; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
12424; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
12425; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
12426; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
12427; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
12428; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
12429; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
12430; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
12431; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
12432; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
12433; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
12434; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
12435; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
12436; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
12437; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
12438; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
12439; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
12440; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
12441; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
12442; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
12443; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
12444; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
12445; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
12446; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
12447; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
12448; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
12449; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
12450; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
12451; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
12452; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
12453; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
12454; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
12455; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
12456; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
12457; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
12458; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
12459; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
12460; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
12461; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
12462; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
12463; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
12464; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
12465; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
12466; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
12467; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
12468; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
12469; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
12470; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
12471; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
12472; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
12473; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
12474; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
12475; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
12476; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
12477; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
12478; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
12479; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
12480; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
12481; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
12482; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
12483; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
12484; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
12485; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
12486; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
12487; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
12488; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
12489; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
12490; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12491; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
12492; GFX11-NEXT:    v_cndmask_b32_e32 v23, v97, v98, vcc_lo
12493; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
12494; GFX11-NEXT:    v_cndmask_b32_e32 v6, v99, v100, vcc_lo
12495; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
12496; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
12497; GFX11-NEXT:    v_cndmask_b32_e32 v22, v101, v102, vcc_lo
12498; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
12499; GFX11-NEXT:    v_cndmask_b32_e32 v5, v103, v112, vcc_lo
12500; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
12501; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12502; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
12503; GFX11-NEXT:    v_cndmask_b32_e32 v21, v113, v114, vcc_lo
12504; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
12505; GFX11-NEXT:    v_cndmask_b32_e32 v4, v115, v116, vcc_lo
12506; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
12507; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
12508; GFX11-NEXT:    v_cndmask_b32_e32 v20, v117, v118, vcc_lo
12509; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
12510; GFX11-NEXT:    v_cndmask_b32_e32 v19, v129, v130, vcc_lo
12511; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
12512; GFX11-NEXT:    v_cndmask_b32_e32 v18, v133, v134, vcc_lo
12513; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
12514; GFX11-NEXT:    v_cndmask_b32_e32 v1, v135, v144, vcc_lo
12515; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
12516; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12517; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
12518; GFX11-NEXT:    v_cndmask_b32_e32 v17, v145, v146, vcc_lo
12519; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12520; GFX11-NEXT:    v_cndmask_b32_e32 v0, v147, v33, vcc_lo
12521; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12522; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
12523; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
12524; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
12525; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12526; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
12527; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
12528; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
12529; GFX11-NEXT:    s_waitcnt vmcnt(0)
12530; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
12531; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12532; GFX11-NEXT:    v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
12533; GFX11-NEXT:    v_add_f32_e32 v15, v15, v18
12534; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12535; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
12536; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
12537; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
12538; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
12539; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
12540; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
12541; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
12542; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12543; GFX11-NEXT:    v_cndmask_b32_e32 v17, v18, v20, vcc_lo
12544; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
12545; GFX11-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
12546; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12547; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
12548; GFX11-NEXT:    s_setpc_b64 s[30:31]
12549  %op = fadd <32 x bfloat> %a, %b
12550  ret <32 x bfloat> %op
12551}
12552
; fadd of a scalar bfloat argument with the constant 1.0.
; In every expected sequence below the constant shows up directly as the
; `1.0` operand of v_add_f32. GCN and GFX7 mask the input and the result with
; 0xffff0000; GFX8 and newer shift the input left by 16, add in f32, then run
; a v_bfe_u32 / +0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask sequence before
; shifting right by 16 (presumably round-to-nearest-even with NaN quieting --
; confirm against the AMDGPU bf16 lowering).
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
12553define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
12554; GCN-LABEL: v_fadd_bf16_fpimm_0:
12555; GCN:       ; %bb.0:
12556; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12557; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12558; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12559; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
12560; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12561; GCN-NEXT:    s_setpc_b64 s[30:31]
12562;
12563; GFX7-LABEL: v_fadd_bf16_fpimm_0:
12564; GFX7:       ; %bb.0:
12565; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12566; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12567; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12568; GFX7-NEXT:    v_add_f32_e32 v0, 1.0, v0
12569; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12570; GFX7-NEXT:    s_setpc_b64 s[30:31]
12571;
12572; GFX8-LABEL: v_fadd_bf16_fpimm_0:
12573; GFX8:       ; %bb.0:
12574; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12575; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12576; GFX8-NEXT:    v_add_f32_e32 v0, 1.0, v0
12577; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
12578; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
12579; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
12580; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12581; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12582; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
12583; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12584; GFX8-NEXT:    s_setpc_b64 s[30:31]
12585;
12586; GFX9-LABEL: v_fadd_bf16_fpimm_0:
12587; GFX9:       ; %bb.0:
12588; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12589; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12590; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
12591; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
12592; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
12593; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
12594; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12595; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12596; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
12597; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12598; GFX9-NEXT:    s_setpc_b64 s[30:31]
12599;
12600; GFX10-LABEL: v_fadd_bf16_fpimm_0:
12601; GFX10:       ; %bb.0:
12602; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12603; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12604; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
12605; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
12606; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12607; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12608; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
12609; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12610; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12611; GFX10-NEXT:    s_setpc_b64 s[30:31]
12612;
12613; GFX11-LABEL: v_fadd_bf16_fpimm_0:
12614; GFX11:       ; %bb.0:
12615; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12616; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12617; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12618; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
12619; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
12620; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12621; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12622; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12623; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
12624; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12625; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12626; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12627; GFX11-NEXT:    s_setpc_b64 s[30:31]
12628  %add = fadd bfloat %arg0, 1.0
12629  ret bfloat %add
12630}
12631
; fadd of a scalar bfloat argument with the constant 42.0.
; Unlike the 1.0 case in v_fadd_bf16_fpimm_0, the constant is expected to be
; emitted as the 32-bit literal 0x42280000 (the f32 bit pattern of 42.0) in
; the v_add_f32 operand on every target checked here. The surrounding
; instructions are otherwise the same per-target sequences: masking with
; 0xffff0000 on GCN and GFX7, and shift-up / f32 add / bfe + 0x7fff +
; v_cndmask / shift-down on GFX8 and newer.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
12632define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
12633; GCN-LABEL: v_fadd_bf16_fpimm_1:
12634; GCN:       ; %bb.0:
12635; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12636; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12637; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12638; GCN-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
12639; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12640; GCN-NEXT:    s_setpc_b64 s[30:31]
12641;
12642; GFX7-LABEL: v_fadd_bf16_fpimm_1:
12643; GFX7:       ; %bb.0:
12644; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12645; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12646; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12647; GFX7-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
12648; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12649; GFX7-NEXT:    s_setpc_b64 s[30:31]
12650;
12651; GFX8-LABEL: v_fadd_bf16_fpimm_1:
12652; GFX8:       ; %bb.0:
12653; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12654; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12655; GFX8-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
12656; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
12657; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
12658; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
12659; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12660; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12661; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
12662; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12663; GFX8-NEXT:    s_setpc_b64 s[30:31]
12664;
12665; GFX9-LABEL: v_fadd_bf16_fpimm_1:
12666; GFX9:       ; %bb.0:
12667; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12668; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12669; GFX9-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
12670; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
12671; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
12672; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
12673; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12674; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12675; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
12676; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12677; GFX9-NEXT:    s_setpc_b64 s[30:31]
12678;
12679; GFX10-LABEL: v_fadd_bf16_fpimm_1:
12680; GFX10:       ; %bb.0:
12681; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12682; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12683; GFX10-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
12684; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
12685; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12686; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12687; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
12688; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12689; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12690; GFX10-NEXT:    s_setpc_b64 s[30:31]
12691;
12692; GFX11-LABEL: v_fadd_bf16_fpimm_1:
12693; GFX11:       ; %bb.0:
12694; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12695; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12696; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12697; GFX11-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
12698; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
12699; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12700; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12701; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12702; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
12703; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12705; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12706; GFX11-NEXT:    s_setpc_b64 s[30:31]
12707  %add = fadd bfloat %arg0, 42.0
12708  ret bfloat %add
12709}
12710
; fsub of two scalar bfloat arguments (%a - %b, passed in v0 and v1).
; Every expected sequence performs the subtraction as a single
; `v_sub_f32 v0, v0, v1`. GCN and GFX7 first multiply each input by 1.0 and
; mask it with 0xffff0000, then mask the difference the same way. GFX8 and
; newer shift both inputs left by 16, subtract, and then apply the
; v_bfe_u32 / +0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask sequence before
; shifting the result right by 16 (presumably round-to-nearest-even with NaN
; quieting -- confirm against the AMDGPU bf16 lowering).
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
12711define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
12712; GCN-LABEL: v_fsub_bf16:
12713; GCN:       ; %bb.0:
12714; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12715; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12716; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
12717; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12718; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12719; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
12720; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12721; GCN-NEXT:    s_setpc_b64 s[30:31]
12722;
12723; GFX7-LABEL: v_fsub_bf16:
12724; GFX7:       ; %bb.0:
12725; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12726; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12727; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
12728; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12729; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12730; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
12731; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12732; GFX7-NEXT:    s_setpc_b64 s[30:31]
12733;
12734; GFX8-LABEL: v_fsub_bf16:
12735; GFX8:       ; %bb.0:
12736; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12737; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12738; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12739; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
12740; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
12741; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
12742; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
12743; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12744; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12745; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
12746; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12747; GFX8-NEXT:    s_setpc_b64 s[30:31]
12748;
12749; GFX9-LABEL: v_fsub_bf16:
12750; GFX9:       ; %bb.0:
12751; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12752; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12753; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12754; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
12755; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
12756; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
12757; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
12758; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12759; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12760; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
12761; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12762; GFX9-NEXT:    s_setpc_b64 s[30:31]
12763;
12764; GFX10-LABEL: v_fsub_bf16:
12765; GFX10:       ; %bb.0:
12766; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12767; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12768; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12769; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
12770; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
12771; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12772; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12773; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
12774; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12775; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12776; GFX10-NEXT:    s_setpc_b64 s[30:31]
12777;
12778; GFX11-LABEL: v_fsub_bf16:
12779; GFX11:       ; %bb.0:
12780; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12781; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12782; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12783; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12784; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
12785; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
12786; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
12787; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12788; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12789; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
12790; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12791; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12792; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12793; GFX11-NEXT:    s_setpc_b64 s[30:31]
12794  %op = fsub bfloat %a, %b
12795  ret bfloat %op
12796}
12797
; fsub of two <2 x bfloat> vectors (%a - %b), expected as two v_sub_f32.
; On GCN and GFX7 the checks use one register per element (v0/v1 minus
; v2/v3), each multiplied by 1.0 and masked with 0xffff0000 before and after
; the subtraction. On GFX8 and newer both elements of a vector share one
; register (v0 and v1): the low element is extracted with a left shift by 16
; and the high element with an AND of 0xffff0000, each difference goes through
; the bfe / +0x7fff / v_cmp_u / v_cndmask sequence, and the two results are
; recombined with v_alignbit_b32 on GFX8 or v_perm_b32 with selector 0x7060302
; on GFX9 and newer.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
12798define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
12799; GCN-LABEL: v_fsub_v2bf16:
12800; GCN:       ; %bb.0:
12801; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12802; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12803; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12804; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
12805; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
12806; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
12807; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12808; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12809; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12810; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
12811; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
12812; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12813; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12814; GCN-NEXT:    s_setpc_b64 s[30:31]
12815;
12816; GFX7-LABEL: v_fsub_v2bf16:
12817; GFX7:       ; %bb.0:
12818; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12819; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12820; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12821; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
12822; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
12823; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
12824; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12825; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12826; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12827; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
12828; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
12829; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12830; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12831; GFX7-NEXT:    s_setpc_b64 s[30:31]
12832;
12833; GFX8-LABEL: v_fsub_v2bf16:
12834; GFX8:       ; %bb.0:
12835; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12836; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
12837; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
12838; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
12839; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
12840; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
12841; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12842; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12843; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
12844; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
12845; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
12846; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12847; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
12848; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
12849; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
12850; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
12851; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
12852; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12853; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
12854; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12855; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
12856; GFX8-NEXT:    s_setpc_b64 s[30:31]
12857;
12858; GFX9-LABEL: v_fsub_v2bf16:
12859; GFX9:       ; %bb.0:
12860; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12861; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
12862; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
12863; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
12864; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12865; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12866; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
12867; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
12868; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
12869; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
12870; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
12871; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12872; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
12873; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
12874; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
12875; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
12876; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12877; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
12878; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
12879; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
12880; GFX9-NEXT:    s_setpc_b64 s[30:31]
12881;
12882; GFX10-LABEL: v_fsub_v2bf16:
12883; GFX10:       ; %bb.0:
12884; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12885; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
12886; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
12887; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12888; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12889; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
12890; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
12891; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
12892; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
12893; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
12894; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12895; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
12896; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
12897; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
12898; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
12899; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12900; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
12901; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
12902; GFX10-NEXT:    s_setpc_b64 s[30:31]
12903;
12904; GFX11-LABEL: v_fsub_v2bf16:
12905; GFX11:       ; %bb.0:
12906; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12907; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
12908; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12909; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
12910; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12911; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
12912; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
12913; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
12914; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12915; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
12916; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
12917; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
12918; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12919; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
12920; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
12921; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
12922; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12923; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
12924; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12925; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
12926; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12927; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
12928; GFX11-NEXT:    s_setpc_b64 s[30:31]
12929  %op = fsub <2 x bfloat> %a, %b
12930  ret <2 x bfloat> %op
12931}
12932
12933define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
12934; GCN-LABEL: v_fsub_v3bf16:
12935; GCN:       ; %bb.0:
12936; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12937; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12938; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
12939; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
12940; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
12941; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12942; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
12943; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
12944; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12945; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
12946; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12947; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
12948; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12949; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
12950; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
12951; GCN-NEXT:    v_sub_f32_e32 v0, v0, v3
12952; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12953; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12954; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12955; GCN-NEXT:    s_setpc_b64 s[30:31]
12956;
12957; GFX7-LABEL: v_fsub_v3bf16:
12958; GFX7:       ; %bb.0:
12959; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12960; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
12961; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
12962; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
12963; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
12964; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12965; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
12966; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
12967; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12968; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
12969; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12970; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
12971; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12972; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
12973; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v4
12974; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v3
12975; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
12976; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
12977; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12978; GFX7-NEXT:    s_setpc_b64 s[30:31]
12979;
12980; GFX8-LABEL: v_fsub_v3bf16:
12981; GFX8:       ; %bb.0:
12982; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12983; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
12984; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12985; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
12986; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
12987; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
12988; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
12989; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
12990; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
12991; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
12992; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
12993; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
12994; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
12995; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
12996; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
12997; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
12998; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12999; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13000; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
13001; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
13002; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13003; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13004; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
13005; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
13006; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
13007; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
13008; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
13009; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13010; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
13011; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13012; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
13013; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
13014; GFX8-NEXT:    s_setpc_b64 s[30:31]
13015;
13016; GFX9-LABEL: v_fsub_v3bf16:
13017; GFX9:       ; %bb.0:
13018; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13019; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13020; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13021; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
13022; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
13023; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
13024; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
13025; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
13026; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13027; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
13028; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13029; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
13030; GFX9-NEXT:    v_sub_f32_e32 v3, v4, v3
13031; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13032; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13033; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
13034; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
13035; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
13036; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13037; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13038; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
13039; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
13040; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
13041; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
13042; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13043; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
13044; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
13045; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
13046; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
13047; GFX9-NEXT:    s_setpc_b64 s[30:31]
13048;
13049; GFX10-LABEL: v_fsub_v3bf16:
13050; GFX10:       ; %bb.0:
13051; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13052; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13053; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13054; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13055; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13056; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13057; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13058; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
13059; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
13060; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
13061; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
13062; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13063; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
13064; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13065; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
13066; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
13067; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13068; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
13069; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
13070; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
13071; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13072; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13073; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13074; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13075; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
13076; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13077; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
13078; GFX10-NEXT:    s_setpc_b64 s[30:31]
13079;
13080; GFX11TRUE16-LABEL: v_fsub_v3bf16:
13081; GFX11TRUE16:       ; %bb.0:
13082; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13083; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13084; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13085; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13086; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13087; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13088; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13089; GFX11TRUE16-NEXT:    v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13090; GFX11TRUE16-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
13091; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13092; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
13093; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13094; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13095; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
13096; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
13097; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
13098; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13099; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
13100; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
13101; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
13102; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13103; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13104; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13105; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13106; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13107; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
13108; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13109; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13110; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
13111; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
13112;
13113; GFX11FAKE16-LABEL: v_fsub_v3bf16:
13114; GFX11FAKE16:       ; %bb.0:
13115; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13116; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13117; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13118; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13119; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13120; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13121; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13122; GFX11FAKE16-NEXT:    v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13123; GFX11FAKE16-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
13124; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13125; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
13126; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13127; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13128; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
13129; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
13130; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
13131; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13132; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
13133; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
13134; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
13135; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13136; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13137; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13138; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13139; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13140; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
13141; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13142; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13143; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
13144; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
13145  %op = fsub <3 x bfloat> %a, %b
13146  ret <3 x bfloat> %op
13147}
13148
; Codegen test for `fsub <4 x bfloat>`: the function returns %a - %b.
; Expected output is listed once per check prefix (GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11), matching the llc RUN lines at the top of the file; both
; gfx1100 runs (+real-true16 and -real-true16) share the common GFX11 prefix
; for this function.
; In the expected code the GCN and GFX7 variants subtract v4-v7 from v0-v3
; after masking with 0xffff0000 and mask each difference again, while GFX8
; and newer use both 16-bit halves of v0-v3 (shift left by 16 for the low
; half, 0xffff0000 mask for the high half), subtract in f32, and repack the
; results with v_alignbit_b32 or v_perm_b32.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
13149define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
13150; GCN-LABEL: v_fsub_v4bf16:
13151; GCN:       ; %bb.0:
13152; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13153; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13154; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
13155; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13156; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
13157; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13158; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
13159; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13160; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
13161; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
13162; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13163; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
13164; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13165; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
13166; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13167; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
13168; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13169; GCN-NEXT:    v_sub_f32_e32 v3, v3, v7
13170; GCN-NEXT:    v_sub_f32_e32 v2, v2, v6
13171; GCN-NEXT:    v_sub_f32_e32 v1, v1, v5
13172; GCN-NEXT:    v_sub_f32_e32 v0, v0, v4
13173; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13174; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13175; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13176; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13177; GCN-NEXT:    s_setpc_b64 s[30:31]
13178;
13179; GFX7-LABEL: v_fsub_v4bf16:
13180; GFX7:       ; %bb.0:
13181; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13182; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13183; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
13184; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13185; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
13186; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13187; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
13188; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13189; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
13190; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
13191; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13192; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
13193; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13194; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
13195; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13196; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
13197; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13198; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v7
13199; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
13200; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v5
13201; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v4
13202; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13203; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13204; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13205; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13206; GFX7-NEXT:    s_setpc_b64 s[30:31]
13207;
13208; GFX8-LABEL: v_fsub_v4bf16:
13209; GFX8:       ; %bb.0:
13210; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13211; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13212; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13213; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
13214; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
13215; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
13216; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13217; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13218; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
13219; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
13220; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
13221; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
13222; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
13223; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
13224; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
13225; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
13226; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
13227; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
13228; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13229; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
13230; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13231; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13232; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
13233; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
13234; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
13235; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13236; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13237; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
13238; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
13239; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
13240; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13241; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
13242; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
13243; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
13244; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
13245; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
13246; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13247; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
13248; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
13249; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13250; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
13251; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
13252; GFX8-NEXT:    s_setpc_b64 s[30:31]
13253;
13254; GFX9-LABEL: v_fsub_v4bf16:
13255; GFX9:       ; %bb.0:
13256; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13257; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13258; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13259; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
13260; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13261; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13262; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
13263; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
13264; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
13265; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
13266; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
13267; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
13268; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
13269; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
13270; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
13271; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
13272; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13273; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
13274; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13275; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13276; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
13277; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13278; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13279; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
13280; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
13281; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
13282; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
13283; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13284; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
13285; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
13286; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
13287; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
13288; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13289; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
13290; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
13291; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
13292; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
13293; GFX9-NEXT:    s_setpc_b64 s[30:31]
13294;
13295; GFX10-LABEL: v_fsub_v4bf16:
13296; GFX10:       ; %bb.0:
13297; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13298; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13299; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13300; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13301; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13302; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
13303; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
13304; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
13305; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13306; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13307; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
13308; GFX10-NEXT:    v_sub_f32_e32 v3, v7, v6
13309; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
13310; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13311; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
13312; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13313; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
13314; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
13315; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
13316; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
13317; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
13318; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
13319; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13320; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13321; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
13322; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
13323; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13324; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
13325; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13326; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13327; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13328; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13329; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
13330; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
13331; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
13332; GFX10-NEXT:    s_setpc_b64 s[30:31]
13333;
13334; GFX11-LABEL: v_fsub_v4bf16:
13335; GFX11:       ; %bb.0:
13336; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13337; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
13338; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
13339; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13340; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13341; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13342; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13344; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
13345; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13346; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
13347; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13348; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
13349; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
13350; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
13351; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
13352; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
13353; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
13354; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13355; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13356; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
13357; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
13358; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
13359; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
13360; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
13361; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13362; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13363; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
13364; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
13365; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13366; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13367; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13368; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13369; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13370; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13371; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
13372; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
13373; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13374; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
13375; GFX11-NEXT:    s_setpc_b64 s[30:31]
13376  %op = fsub <4 x bfloat> %a, %b
13377  ret <4 x bfloat> %op
13378}
13379
; Codegen test for scalar `fmul bfloat`: the function returns %a * %b.
; Expected output is listed once per check prefix (GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11), matching the llc RUN lines at the top of the file; both
; gfx1100 runs share the common GFX11 prefix for this function.
; In the expected code the GCN and GFX7 variants multiply the
; 0xffff0000-masked inputs and mask the product, while GFX8 and newer shift
; both inputs left by 16, multiply in f32, run the v_bfe_u32 / add 0x7fff /
; v_or 0x400000 / v_cmp_u / v_cndmask sequence on the product, and shift the
; selected value right by 16.
; NOTE(review): that sequence looks like f32-to-bf16 rounding with a separate
; path for unordered (NaN) products -- confirm against the lowering code.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
13380define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
13381; GCN-LABEL: v_fmul_bf16:
13382; GCN:       ; %bb.0:
13383; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13384; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13385; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13386; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13387; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13388; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
13389; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13390; GCN-NEXT:    s_setpc_b64 s[30:31]
13391;
13392; GFX7-LABEL: v_fmul_bf16:
13393; GFX7:       ; %bb.0:
13394; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13395; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13396; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13397; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13398; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13399; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
13400; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13401; GFX7-NEXT:    s_setpc_b64 s[30:31]
13402;
13403; GFX8-LABEL: v_fmul_bf16:
13404; GFX8:       ; %bb.0:
13405; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13406; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13407; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
13408; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
13409; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
13410; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
13411; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
13412; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
13413; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13414; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
13415; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13416; GFX8-NEXT:    s_setpc_b64 s[30:31]
13417;
13418; GFX9-LABEL: v_fmul_bf16:
13419; GFX9:       ; %bb.0:
13420; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13421; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13422; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
13423; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
13424; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
13425; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
13426; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
13427; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
13428; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13429; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
13430; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13431; GFX9-NEXT:    s_setpc_b64 s[30:31]
13432;
13433; GFX10-LABEL: v_fmul_bf16:
13434; GFX10:       ; %bb.0:
13435; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13436; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13437; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
13438; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
13439; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
13440; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
13441; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13442; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
13443; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
13444; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13445; GFX10-NEXT:    s_setpc_b64 s[30:31]
13446;
13447; GFX11-LABEL: v_fmul_bf16:
13448; GFX11:       ; %bb.0:
13449; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13450; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13451; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
13452; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13453; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
13454; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
13455; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
13456; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13457; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
13458; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
13459; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
13460; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13461; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13462; GFX11-NEXT:    s_setpc_b64 s[30:31]
13463  %op = fmul bfloat %a, %b
13464  ret bfloat %op
13465}
13466
; Codegen test for `fmul <2 x bfloat>`: the function returns %a * %b.
; Expected output is listed once per check prefix (GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11), matching the llc RUN lines at the top of the file; both
; gfx1100 runs share the common GFX11 prefix for this function.
; In the expected code the GCN and GFX7 variants multiply v0 by v2 and v1 by
; v3 after masking with 0xffff0000 and mask each product again, while GFX8
; and newer use both 16-bit halves of v0 and v1 (shift left by 16 for the low
; half, 0xffff0000 mask for the high half), multiply in f32, and repack the
; two results into v0 with v_alignbit_b32 (GFX8) or v_perm_b32 with selector
; 0x7060302 (GFX9 and newer).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
13467define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
13468; GCN-LABEL: v_fmul_v2bf16:
13469; GCN:       ; %bb.0:
13470; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13471; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13472; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13473; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13474; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13475; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13476; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13477; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13478; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13479; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
13480; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
13481; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13482; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13483; GCN-NEXT:    s_setpc_b64 s[30:31]
13484;
13485; GFX7-LABEL: v_fmul_v2bf16:
13486; GFX7:       ; %bb.0:
13487; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13488; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13489; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13490; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13491; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13492; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13493; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13494; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13495; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13496; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
13497; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
13498; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13499; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13500; GFX7-NEXT:    s_setpc_b64 s[30:31]
13501;
13502; GFX8-LABEL: v_fmul_v2bf16:
13503; GFX8:       ; %bb.0:
13504; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13505; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
13506; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
13507; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
13508; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
13509; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
13510; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13511; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13512; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
13513; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
13514; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
13515; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
13516; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
13517; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
13518; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
13519; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
13520; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
13521; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13522; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
13523; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13524; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
13525; GFX8-NEXT:    s_setpc_b64 s[30:31]
13526;
13527; GFX9-LABEL: v_fmul_v2bf16:
13528; GFX9:       ; %bb.0:
13529; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13530; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
13531; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
13532; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
13533; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13534; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13535; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
13536; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
13537; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
13538; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
13539; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
13540; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
13541; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
13542; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
13543; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
13544; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
13545; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13546; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
13547; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
13548; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
13549; GFX9-NEXT:    s_setpc_b64 s[30:31]
13550;
13551; GFX10-LABEL: v_fmul_v2bf16:
13552; GFX10:       ; %bb.0:
13553; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13554; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
13555; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
13556; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13557; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13558; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
13559; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
13560; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
13561; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
13562; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
13563; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
13564; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
13565; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
13566; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
13567; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
13568; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13569; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
13570; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
13571; GFX10-NEXT:    s_setpc_b64 s[30:31]
13572;
13573; GFX11-LABEL: v_fmul_v2bf16:
13574; GFX11:       ; %bb.0:
13575; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13576; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
13577; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13578; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
13579; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13580; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
13581; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
13582; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
13583; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13584; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
13585; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
13586; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
13587; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
13588; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
13589; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
13590; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
13591; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
13592; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
13593; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13594; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
13595; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13596; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
13597; GFX11-NEXT:    s_setpc_b64 s[30:31]
13598  %op = fmul <2 x bfloat> %a, %b
13599  ret <2 x bfloat> %op
13600}
13601
13602define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
13603; GCN-LABEL: v_fmul_v3bf16:
13604; GCN:       ; %bb.0:
13605; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13606; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13607; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13608; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13609; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
13610; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13611; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
13612; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
13613; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13614; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
13615; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13616; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13617; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13618; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
13619; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
13620; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
13621; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13622; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13623; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13624; GCN-NEXT:    s_setpc_b64 s[30:31]
13625;
13626; GFX7-LABEL: v_fmul_v3bf16:
13627; GFX7:       ; %bb.0:
13628; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13629; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13630; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13631; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13632; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
13633; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13634; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
13635; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
13636; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13637; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
13638; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13639; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13640; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13641; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v5
13642; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
13643; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
13644; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13645; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13646; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13647; GFX7-NEXT:    s_setpc_b64 s[30:31]
13648;
13649; GFX8-LABEL: v_fmul_v3bf16:
13650; GFX8:       ; %bb.0:
13651; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13652; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13653; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13654; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
13655; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
13656; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
13657; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
13658; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
13659; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13660; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
13661; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13662; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
13663; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
13664; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
13665; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
13666; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
13667; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13668; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13669; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
13670; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
13671; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13672; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13673; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
13674; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
13675; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
13676; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
13677; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
13678; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13679; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
13680; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13681; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
13682; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
13683; GFX8-NEXT:    s_setpc_b64 s[30:31]
13684;
13685; GFX9-LABEL: v_fmul_v3bf16:
13686; GFX9:       ; %bb.0:
13687; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13688; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13689; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13690; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
13691; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
13692; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
13693; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
13694; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
13695; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13696; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
13697; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13698; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
13699; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
13700; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13701; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13702; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
13703; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
13704; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
13705; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13706; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13707; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
13708; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
13709; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
13710; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
13711; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13712; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
13713; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
13714; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
13715; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
13716; GFX9-NEXT:    s_setpc_b64 s[30:31]
13717;
13718; GFX10-LABEL: v_fmul_v3bf16:
13719; GFX10:       ; %bb.0:
13720; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13721; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13722; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13723; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13724; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13725; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13726; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13727; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
13728; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
13729; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
13730; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
13731; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13732; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
13733; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13734; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
13735; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
13736; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13737; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
13738; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
13739; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
13740; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13741; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13742; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13743; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13744; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
13745; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13746; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
13747; GFX10-NEXT:    s_setpc_b64 s[30:31]
13748;
13749; GFX11TRUE16-LABEL: v_fmul_v3bf16:
13750; GFX11TRUE16:       ; %bb.0:
13751; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13752; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13753; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13754; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13755; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13756; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13757; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13758; GFX11TRUE16-NEXT:    v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13759; GFX11TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
13760; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13761; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
13762; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13763; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13764; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
13765; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
13766; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
13767; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13768; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
13769; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
13770; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
13771; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13772; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13773; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13774; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13775; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13776; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
13777; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13778; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13779; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
13780; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
13781;
13782; GFX11FAKE16-LABEL: v_fmul_v3bf16:
13783; GFX11FAKE16:       ; %bb.0:
13784; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13785; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13786; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13787; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13788; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13789; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13790; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13791; GFX11FAKE16-NEXT:    v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13792; GFX11FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
13793; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13794; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
13795; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13796; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13797; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
13798; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
13799; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
13800; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13801; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
13802; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
13803; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
13804; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13805; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13806; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13807; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13808; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13809; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
13810; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13811; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13812; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
13813; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
13814  %op = fmul <3 x bfloat> %a, %b
13815  ret <3 x bfloat> %op
13816}
13817
; v_fmul_v4bf16 multiplies two <4 x bfloat> vectors element-wise with a single
; fmul and returns the <4 x bfloat> product. The assertion lines inside the
; function pin the llc output for every check prefix used by this file.
;
; What the expected output shows, per target group
;  - GCN and GFX7 handle each lane in its own register (v0-v3 and v4-v7):
;    every input is multiplied by 1.0 and masked with 0xffff0000, the lanes are
;    multiplied pairwise with v_mul_f32, and each product is masked with
;    0xffff0000 again.
;  - GFX8, GFX9, GFX10 and GFX11 use both 16-bit halves of v0-v3: the low half
;    is shifted left by 16 and the high half is isolated with 0xffff0000, the
;    halves are multiplied with v_mul_f32, and each product goes through a
;    v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32
;    sequence (presumably the f32 to bf16 rounding with NaN handling; confirm
;    against the AMDGPU lowering). Results are repacked with v_alignbit_b32 on
;    GFX8 and with v_perm_b32 selector 0x7060302 on GFX9 and later.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
13818define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
13819; GCN-LABEL: v_fmul_v4bf16:
13820; GCN:       ; %bb.0:
13821; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13822; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13823; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
13824; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13825; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
13826; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13827; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
13828; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13829; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
13830; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
13831; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13832; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
13833; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13834; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
13835; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13836; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
13837; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13838; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
13839; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
13840; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
13841; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
13842; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13843; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13844; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13845; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13846; GCN-NEXT:    s_setpc_b64 s[30:31]
13847;
13848; GFX7-LABEL: v_fmul_v4bf16:
13849; GFX7:       ; %bb.0:
13850; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13851; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
13852; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
13853; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
13854; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
13855; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13856; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
13857; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
13858; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
13859; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
13860; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13861; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
13862; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13863; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
13864; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13865; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
13866; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13867; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v7
13868; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v6
13869; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
13870; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
13871; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13872; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13873; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13874; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13875; GFX7-NEXT:    s_setpc_b64 s[30:31]
13876;
13877; GFX8-LABEL: v_fmul_v4bf16:
13878; GFX8:       ; %bb.0:
13879; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13880; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13881; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13882; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
13883; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
13884; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
13885; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13886; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13887; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
13888; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
13889; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
13890; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
13891; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
13892; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
13893; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
13894; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
13895; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
13896; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
13897; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13898; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
13899; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13900; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13901; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
13902; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
13903; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
13904; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13905; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13906; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
13907; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
13908; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
13909; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13910; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
13911; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
13912; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
13913; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
13914; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
13915; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13916; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
13917; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
13918; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
13919; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
13920; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
13921; GFX8-NEXT:    s_setpc_b64 s[30:31]
13922;
13923; GFX9-LABEL: v_fmul_v4bf16:
13924; GFX9:       ; %bb.0:
13925; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13926; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13927; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13928; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
13929; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13930; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13931; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
13932; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
13933; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
13934; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
13935; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
13936; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
13937; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
13938; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
13939; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
13940; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
13941; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
13942; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
13943; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
13944; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
13945; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
13946; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13947; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13948; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
13949; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
13950; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
13951; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
13952; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13953; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
13954; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
13955; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
13956; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
13957; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
13958; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
13959; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
13960; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
13961; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
13962; GFX9-NEXT:    s_setpc_b64 s[30:31]
13963;
13964; GFX10-LABEL: v_fmul_v4bf16:
13965; GFX10:       ; %bb.0:
13966; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13967; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
13968; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
13969; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
13970; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
13971; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
13972; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
13973; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
13974; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13975; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
13976; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
13977; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
13978; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
13979; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
13980; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
13981; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
13982; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
13983; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
13984; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
13985; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
13986; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
13987; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
13988; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13989; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
13990; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
13991; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
13992; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
13993; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
13994; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13995; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
13996; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13997; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
13998; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
13999; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
14000; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
14001; GFX10-NEXT:    s_setpc_b64 s[30:31]
14002;
14003; GFX11-LABEL: v_fmul_v4bf16:
14004; GFX11:       ; %bb.0:
14005; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14006; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
14007; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
14008; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14009; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14010; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
14011; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
14012; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14013; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
14014; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14015; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
14016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14017; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
14018; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
14019; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
14020; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
14021; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
14022; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
14023; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
14024; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
14025; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
14026; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
14027; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
14028; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
14029; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
14030; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
14031; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
14032; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
14033; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
14034; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14035; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
14036; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
14037; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14038; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
14039; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
14040; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
14041; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
14042; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14043; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
14044; GFX11-NEXT:    s_setpc_b64 s[30:31]
14045  %op = fmul <4 x bfloat> %a, %b
14046  ret <4 x bfloat> %op
14047}
14048
14049define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
14050; GCN-LABEL: v_fmul_v8bf16:
14051; GCN:       ; %bb.0:
14052; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14053; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14054; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
14055; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14056; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
14057; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14058; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
14059; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14060; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
14061; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
14062; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
14063; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
14064; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
14065; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
14066; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
14067; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
14068; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
14069; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14070; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14071; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14072; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14073; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14074; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14075; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14076; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14077; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14078; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14079; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14080; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14081; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14082; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14083; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14084; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14085; GCN-NEXT:    v_mul_f32_e32 v7, v7, v15
14086; GCN-NEXT:    v_mul_f32_e32 v6, v6, v14
14087; GCN-NEXT:    v_mul_f32_e32 v5, v5, v13
14088; GCN-NEXT:    v_mul_f32_e32 v4, v4, v12
14089; GCN-NEXT:    v_mul_f32_e32 v3, v3, v11
14090; GCN-NEXT:    v_mul_f32_e32 v2, v2, v10
14091; GCN-NEXT:    v_mul_f32_e32 v1, v1, v9
14092; GCN-NEXT:    v_mul_f32_e32 v0, v0, v8
14093; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14094; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14095; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14096; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14097; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14098; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14099; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14100; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14101; GCN-NEXT:    s_setpc_b64 s[30:31]
14102;
14103; GFX7-LABEL: v_fmul_v8bf16:
14104; GFX7:       ; %bb.0:
14105; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14106; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14107; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
14108; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14109; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
14110; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14111; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
14112; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14113; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
14114; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
14115; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
14116; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
14117; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
14118; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
14119; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
14120; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
14121; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
14122; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14123; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14124; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14125; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14126; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14127; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14128; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14129; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14130; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14131; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14132; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14133; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14134; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14135; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14136; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14137; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14138; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v15
14139; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v14
14140; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v13
14141; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v12
14142; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v11
14143; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v10
14144; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v9
14145; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v8
14146; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14147; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14148; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14149; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14150; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14151; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14152; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14153; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14154; GFX7-NEXT:    s_setpc_b64 s[30:31]
14155;
14156; GFX8-LABEL: v_fmul_v8bf16:
14157; GFX8:       ; %bb.0:
14158; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14159; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
14160; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
14161; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
14162; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
14163; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
14164; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14165; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14166; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
14167; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v7
14168; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
14169; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
14170; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
14171; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
14172; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
14173; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
14174; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
14175; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14176; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14177; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
14178; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
14179; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
14180; GFX8-NEXT:    v_mul_f32_e32 v7, v9, v7
14181; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
14182; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
14183; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14184; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14185; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
14186; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v6
14187; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
14188; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
14189; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
14190; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
14191; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
14192; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
14193; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
14194; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14195; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
14196; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
14197; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
14198; GFX8-NEXT:    v_mul_f32_e32 v6, v9, v6
14199; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
14200; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
14201; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14202; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14203; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
14204; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v5
14205; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14206; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
14207; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
14208; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
14209; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
14210; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
14211; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
14212; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
14213; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
14214; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
14215; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
14216; GFX8-NEXT:    v_mul_f32_e32 v5, v9, v5
14217; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
14218; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
14219; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14220; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14221; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
14222; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
14223; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14224; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14225; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
14226; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14227; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
14228; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
14229; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
14230; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
14231; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
14232; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
14233; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
14234; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14235; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
14236; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
14237; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
14238; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
14239; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
14240; GFX8-NEXT:    s_setpc_b64 s[30:31]
14241;
14242; GFX9-LABEL: v_fmul_v8bf16:
14243; GFX9:       ; %bb.0:
14244; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14245; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
14246; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
14247; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
14248; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14249; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14250; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
14251; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
14252; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
14253; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
14254; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
14255; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
14256; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
14257; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
14258; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
14259; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14260; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14261; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
14262; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
14263; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
14264; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v7
14265; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14266; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14267; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
14268; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
14269; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
14270; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
14271; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
14272; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
14273; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
14274; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
14275; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
14276; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14277; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
14278; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
14279; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
14280; GFX9-NEXT:    v_mul_f32_e32 v6, v9, v6
14281; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14282; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14283; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
14284; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
14285; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
14286; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14287; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
14288; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
14289; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
14290; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
14291; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
14292; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
14293; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
14294; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
14295; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
14296; GFX9-NEXT:    v_mul_f32_e32 v5, v9, v5
14297; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14298; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14299; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
14300; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
14301; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
14302; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14303; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14304; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
14305; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14306; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
14307; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
14308; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
14309; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
14310; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
14311; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
14312; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
14313; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
14314; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
14315; GFX9-NEXT:    s_setpc_b64 s[30:31]
14316;
14317; GFX10-LABEL: v_fmul_v8bf16:
14318; GFX10:       ; %bb.0:
14319; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14320; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
14321; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
14322; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14323; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14324; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
14325; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14326; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
14327; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
14328; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14329; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
14330; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
14331; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
14332; GFX10-NEXT:    v_mul_f32_e32 v7, v10, v9
14333; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
14334; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
14335; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
14336; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
14337; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
14338; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
14339; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
14340; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
14341; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
14342; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
14343; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
14344; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
14345; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
14346; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
14347; GFX10-NEXT:    v_mul_f32_e32 v6, v10, v6
14348; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
14349; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14350; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14351; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
14352; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
14353; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
14354; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
14355; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14356; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14357; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
14358; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
14359; GFX10-NEXT:    v_mul_f32_e32 v5, v15, v13
14360; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
14361; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
14362; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
14363; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
14364; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14365; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
14366; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
14367; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
14368; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
14369; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
14370; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
14371; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
14372; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
14373; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
14374; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14375; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
14376; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
14377; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
14378; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
14379; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
14380; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
14381; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
14382; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
14383; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
14384; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
14385; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
14386; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
14387; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
14388; GFX10-NEXT:    s_setpc_b64 s[30:31]
14389;
14390; GFX11-LABEL: v_fmul_v8bf16:
14391; GFX11:       ; %bb.0:
14392; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14393; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
14394; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
14395; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14396; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
14397; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
14398; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14399; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
14400; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
14401; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
14402; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14403; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
14404; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14405; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v7
14406; GFX11-NEXT:    v_mul_f32_e32 v7, v10, v9
14407; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
14408; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
14409; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14410; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
14411; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
14412; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
14413; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
14414; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
14415; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
14416; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
14417; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
14418; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
14419; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14420; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
14421; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14422; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
14423; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
14424; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
14425; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14426; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
14427; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14428; GFX11-NEXT:    v_mul_f32_e32 v6, v10, v6
14429; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
14430; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
14431; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
14432; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
14433; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
14434; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
14435; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14436; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
14437; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
14438; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14439; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14440; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
14441; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
14442; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
14443; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
14444; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
14445; GFX11-NEXT:    v_mul_f32_e32 v5, v15, v13
14446; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14447; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
14448; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
14449; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
14450; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
14451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14452; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
14453; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
14454; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14455; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
14456; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
14457; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
14458; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
14459; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
14460; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
14461; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
14462; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
14463; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
14464; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
14465; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
14466; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
14467; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
14468; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
14469; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14470; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
14471; GFX11-NEXT:    s_setpc_b64 s[30:31]
14472  %op = fmul <8 x bfloat> %a, %b
14473  ret <8 x bfloat> %op
14474}
14475
14476define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
14477; GCN-LABEL: v_fmul_v16bf16:
14478; GCN:       ; %bb.0:
14479; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14480; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
14481; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
14482; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
14483; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14484; GCN-NEXT:    v_mul_f32_e32 v14, v14, v30
14485; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
14486; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
14487; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
14488; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14489; GCN-NEXT:    v_mul_f32_e32 v13, v13, v29
14490; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
14491; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
14492; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
14493; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14494; GCN-NEXT:    v_mul_f32_e32 v12, v12, v28
14495; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
14496; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
14497; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
14498; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
14499; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
14500; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
14501; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
14502; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
14503; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
14504; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
14505; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
14506; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
14507; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
14508; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
14509; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
14510; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
14511; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14512; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
14513; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14514; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
14515; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14516; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
14517; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14518; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
14519; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
14520; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
14521; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14522; GCN-NEXT:    v_mul_f32_e32 v11, v11, v27
14523; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32
14524; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
14525; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14526; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
14527; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14528; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
14529; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14530; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
14531; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14532; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
14533; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14534; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
14535; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14536; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
14537; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14538; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
14539; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14540; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
14541; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14542; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
14543; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14544; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
14545; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14546; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14547; GCN-NEXT:    v_mul_f32_e32 v10, v10, v26
14548; GCN-NEXT:    v_mul_f32_e32 v9, v9, v25
14549; GCN-NEXT:    v_mul_f32_e32 v8, v8, v24
14550; GCN-NEXT:    v_mul_f32_e32 v7, v7, v23
14551; GCN-NEXT:    v_mul_f32_e32 v6, v6, v22
14552; GCN-NEXT:    v_mul_f32_e32 v5, v5, v21
14553; GCN-NEXT:    v_mul_f32_e32 v4, v4, v20
14554; GCN-NEXT:    v_mul_f32_e32 v3, v3, v19
14555; GCN-NEXT:    v_mul_f32_e32 v2, v2, v18
14556; GCN-NEXT:    v_mul_f32_e32 v1, v1, v17
14557; GCN-NEXT:    v_mul_f32_e32 v0, v0, v16
14558; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14559; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14560; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14561; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14562; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14563; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14564; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14565; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14566; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14567; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14568; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14569; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14570; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14571; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14572; GCN-NEXT:    s_waitcnt vmcnt(0)
14573; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v27
14574; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
14575; GCN-NEXT:    v_mul_f32_e32 v15, v15, v16
14576; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14577; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14578; GCN-NEXT:    s_setpc_b64 s[30:31]
14579;
14580; GFX7-LABEL: v_fmul_v16bf16:
14581; GFX7:       ; %bb.0:
14582; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14583; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
14584; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
14585; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
14586; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14587; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v27
14588; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32
14589; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
14590; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
14591; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
14592; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14593; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
14594; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
14595; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
14596; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
14597; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
14598; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
14599; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
14600; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
14601; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
14602; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
14603; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
14604; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
14605; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
14606; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
14607; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
14608; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
14609; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v22
14610; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
14611; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
14612; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
14613; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14614; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
14615; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14616; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
14617; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14618; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
14619; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14620; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
14621; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
14622; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14623; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
14624; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14625; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
14626; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14627; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
14628; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14629; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
14630; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14631; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
14632; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14633; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
14634; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14635; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14636; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
14637; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14638; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
14639; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14640; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
14641; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14642; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
14643; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14644; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
14645; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14646; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
14647; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14648; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v30
14649; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v29
14650; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v28
14651; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v26
14652; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v25
14653; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v24
14654; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v23
14655; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v21
14656; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v20
14657; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v19
14658; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v18
14659; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v17
14660; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v16
14661; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14662; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14663; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14664; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14665; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14666; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14667; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14668; GFX7-NEXT:    s_waitcnt vmcnt(0)
14669; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v27
14670; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
14671; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v22
14672; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14673; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14674; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14675; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14676; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14677; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14678; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14679; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14680; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14681; GFX7-NEXT:    s_setpc_b64 s[30:31]
14682;
14683; GFX8-LABEL: v_fmul_v16bf16:
14684; GFX8:       ; %bb.0:
14685; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14686; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
14687; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
14688; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
14689; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
14690; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
14691; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
14692; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14693; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14694; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14695; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v15
14696; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
14697; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
14698; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
14699; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
14700; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
14701; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
14702; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
14703; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
14704; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
14705; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
14706; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
14707; GFX8-NEXT:    v_mul_f32_e32 v15, v17, v15
14708; GFX8-NEXT:    v_bfe_u32 v17, v15, 16, 1
14709; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v15
14710; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14711; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14712; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14713; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v14
14714; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
14715; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
14716; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
14717; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
14718; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
14719; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
14720; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
14721; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
14722; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
14723; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
14724; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
14725; GFX8-NEXT:    v_mul_f32_e32 v14, v17, v14
14726; GFX8-NEXT:    v_bfe_u32 v17, v14, 16, 1
14727; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v14
14728; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14729; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14730; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14731; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v13
14732; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
14733; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
14734; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
14735; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
14736; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
14737; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
14738; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
14739; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14740; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
14741; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
14742; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
14743; GFX8-NEXT:    v_mul_f32_e32 v13, v17, v13
14744; GFX8-NEXT:    v_bfe_u32 v17, v13, 16, 1
14745; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v13
14746; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14747; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14748; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14749; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v12
14750; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
14751; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
14752; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
14753; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
14754; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
14755; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
14756; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
14757; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
14758; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
14759; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
14760; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
14761; GFX8-NEXT:    v_mul_f32_e32 v12, v17, v12
14762; GFX8-NEXT:    v_bfe_u32 v17, v12, 16, 1
14763; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v12
14764; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14765; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14766; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14767; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v11
14768; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
14769; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
14770; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
14771; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
14772; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
14773; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
14774; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
14775; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14776; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
14777; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
14778; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
14779; GFX8-NEXT:    v_mul_f32_e32 v11, v17, v11
14780; GFX8-NEXT:    v_bfe_u32 v17, v11, 16, 1
14781; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v11
14782; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14783; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14784; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14785; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v10
14786; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
14787; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
14788; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
14789; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
14790; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
14791; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
14792; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
14793; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14794; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
14795; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
14796; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
14797; GFX8-NEXT:    v_mul_f32_e32 v10, v17, v10
14798; GFX8-NEXT:    v_bfe_u32 v17, v10, 16, 1
14799; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v10
14800; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14801; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14802; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14803; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v9
14804; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
14805; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
14806; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
14807; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
14808; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
14809; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
14810; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
14811; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
14812; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
14813; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
14814; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
14815; GFX8-NEXT:    v_mul_f32_e32 v9, v17, v9
14816; GFX8-NEXT:    v_bfe_u32 v17, v9, 16, 1
14817; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v9
14818; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14819; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14820; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
14821; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
14822; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
14823; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
14824; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
14825; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
14826; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
14827; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
14828; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
14829; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
14830; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
14831; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
14832; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
14833; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
14834; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
14835; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
14836; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
14837; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14838; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
14839; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
14840; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
14841; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
14842; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
14843; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
14844; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
14845; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
14846; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
14847; GFX8-NEXT:    s_setpc_b64 s[30:31]
14848;
14849; GFX9-LABEL: v_fmul_v16bf16:
14850; GFX9:       ; %bb.0:
14851; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14852; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
14853; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
14854; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
14855; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14856; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14857; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
14858; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
14859; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
14860; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
14861; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
14862; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
14863; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
14864; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
14865; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
14866; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
14867; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
14868; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
14869; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
14870; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
14871; GFX9-NEXT:    v_mul_f32_e32 v15, v17, v15
14872; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
14873; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
14874; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
14875; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
14876; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
14877; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
14878; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
14879; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
14880; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
14881; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
14882; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
14883; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
14884; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
14885; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
14886; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
14887; GFX9-NEXT:    v_mul_f32_e32 v14, v17, v14
14888; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
14889; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
14890; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
14891; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
14892; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
14893; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
14894; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
14895; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
14896; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
14897; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
14898; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
14899; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14900; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
14901; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
14902; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
14903; GFX9-NEXT:    v_mul_f32_e32 v13, v17, v13
14904; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
14905; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
14906; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
14907; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
14908; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
14909; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
14910; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
14911; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
14912; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
14913; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
14914; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
14915; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
14916; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
14917; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
14918; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
14919; GFX9-NEXT:    v_mul_f32_e32 v12, v17, v12
14920; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
14921; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14922; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
14923; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
14924; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
14925; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
14926; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
14927; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
14928; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
14929; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
14930; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
14931; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14932; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
14933; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
14934; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
14935; GFX9-NEXT:    v_mul_f32_e32 v11, v17, v11
14936; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
14937; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14938; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
14939; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
14940; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
14941; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
14942; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
14943; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
14944; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
14945; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
14946; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
14947; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14948; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
14949; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
14950; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
14951; GFX9-NEXT:    v_mul_f32_e32 v10, v17, v10
14952; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
14953; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
14954; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
14955; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
14956; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
14957; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
14958; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
14959; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
14960; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
14961; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
14962; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
14963; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
14964; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
14965; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
14966; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
14967; GFX9-NEXT:    v_mul_f32_e32 v9, v17, v9
14968; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
14969; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
14970; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
14971; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
14972; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
14973; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
14974; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
14975; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
14976; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
14977; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
14978; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
14979; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
14980; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
14981; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
14982; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
14983; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
14984; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
14985; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
14986; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
14987; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
14988; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
14989; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
14990; GFX9-NEXT:    s_setpc_b64 s[30:31]
14991;
14992; GFX10-LABEL: v_fmul_v16bf16:
14993; GFX10:       ; %bb.0:
14994; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14995; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
14996; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
14997; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
14998; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
14999; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
15000; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15001; GFX10-NEXT:    v_mul_f32_e32 v16, v17, v16
15002; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
15003; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
15004; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15005; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
15006; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
15007; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
15008; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
15009; GFX10-NEXT:    v_mul_f32_e32 v17, v18, v17
15010; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
15011; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
15012; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
15013; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
15014; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
15015; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
15016; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
15017; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
15018; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
15019; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
15020; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
15021; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
15022; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
15023; GFX10-NEXT:    v_bfe_u32 v18, v6, 16, 1
15024; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15025; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
15026; GFX10-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
15027; GFX10-NEXT:    v_mul_f32_e32 v17, v20, v19
15028; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
15029; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v13
15030; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
15031; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
15032; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
15033; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
15034; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
15035; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15036; GFX10-NEXT:    v_bfe_u32 v21, v5, 16, 1
15037; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15038; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
15039; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
15040; GFX10-NEXT:    v_mul_f32_e32 v13, v19, v18
15041; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
15042; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
15043; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
15044; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
15045; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
15046; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
15047; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
15048; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
15049; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
15050; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
15051; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
15052; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
15053; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15054; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
15055; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
15056; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15057; GFX10-NEXT:    v_mul_f32_e32 v12, v18, v12
15058; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
15059; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
15060; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
15061; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
15062; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
15063; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
15064; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
15065; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
15066; GFX10-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
15067; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
15068; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15069; GFX10-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
15070; GFX10-NEXT:    v_mul_f32_e32 v18, v19, v18
15071; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15072; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
15073; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
15074; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
15075; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
15076; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
15077; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
15078; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
15079; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
15080; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
15081; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
15082; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
15083; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
15084; GFX10-NEXT:    v_bfe_u32 v19, v2, 16, 1
15085; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
15086; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
15087; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15088; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
15089; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
15090; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
15091; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
15092; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
15093; GFX10-NEXT:    v_mul_f32_e32 v19, v22, v20
15094; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
15095; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
15096; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15097; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
15098; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
15099; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
15100; GFX10-NEXT:    v_mul_f32_e32 v9, v22, v20
15101; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
15102; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v8
15103; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
15104; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
15105; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
15106; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
15107; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
15108; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
15109; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
15110; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
15111; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
15112; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
15113; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
15114; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
15115; GFX10-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
15116; GFX10-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
15117; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
15118; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
15119; GFX10-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
15120; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
15121; GFX10-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
15122; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
15123; GFX10-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
15124; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
15125; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
15126; GFX10-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
15127; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
15128; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
15129; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
15130; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
15131; GFX10-NEXT:    s_setpc_b64 s[30:31]
15132;
15133; GFX11-LABEL: v_fmul_v16bf16:
15134; GFX11:       ; %bb.0:
15135; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15136; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
15137; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
15138; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15139; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
15140; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
15141; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
15142; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
15143; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15144; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
15145; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
15146; GFX11-NEXT:    v_mul_f32_e32 v17, v18, v17
15147; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v14
15148; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
15149; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15150; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
15151; GFX11-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
15152; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
15153; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
15154; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v15
15155; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
15156; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
15157; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
15158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15159; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
15160; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
15161; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
15162; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
15163; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
15164; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
15165; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
15166; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
15167; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
15168; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
15169; GFX11-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
15170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
15171; GFX11-NEXT:    v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
15172; GFX11-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
15173; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
15174; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
15175; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15176; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
15177; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15178; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
15179; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15180; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
15181; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v12
15182; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
15183; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
15184; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v13
15185; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
15186; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
15187; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
15188; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
15189; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
15190; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
15191; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
15192; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
15193; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
15194; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
15195; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
15196; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
15197; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
15198; GFX11-NEXT:    v_mul_f32_e32 v12, v18, v12
15199; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
15200; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
15201; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
15202; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
15203; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
15204; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15205; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
15206; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
15207; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
15208; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
15209; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
15210; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
15211; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
15212; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
15213; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
15214; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
15215; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
15216; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15217; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15218; GFX11-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
15219; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
15220; GFX11-NEXT:    v_mul_f32_e32 v18, v19, v18
15221; GFX11-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
15222; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
15223; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15224; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15225; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
15226; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
15227; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15228; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
15229; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v11
15230; GFX11-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
15231; GFX11-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
15232; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
15233; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
15234; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
15235; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
15236; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
15237; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15238; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
15239; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
15240; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
15241; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
15242; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
15243; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
15244; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
15245; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
15246; GFX11-NEXT:    v_mul_f32_e32 v19, v22, v20
15247; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
15248; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
15249; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
15250; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
15251; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
15252; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15253; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15254; GFX11-NEXT:    v_bfe_u32 v23, v19, 16, 1
15255; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9
15256; GFX11-NEXT:    v_mul_f32_e32 v9, v22, v20
15257; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
15258; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
15259; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
15260; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
15261; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
15262; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
15263; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
15264; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
15265; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
15266; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
15267; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
15268; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
15269; GFX11-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
15270; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
15271; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
15272; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
15273; GFX11-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
15274; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
15275; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
15276; GFX11-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
15277; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
15278; GFX11-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
15279; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
15280; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15281; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
15282; GFX11-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
15283; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
15284; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
15285; GFX11-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
15286; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15287; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
15288; GFX11-NEXT:    s_setpc_b64 s[30:31]
15289  %op = fmul <16 x bfloat> %a, %b
15290  ret <16 x bfloat> %op
15291}
15292
15293define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
15294; GCN-LABEL: v_fmul_v32bf16:
15295; GCN:       ; %bb.0:
15296; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15297; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
15298; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
15299; GCN-NEXT:    s_waitcnt vmcnt(1)
15300; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
15301; GCN-NEXT:    s_waitcnt vmcnt(0)
15302; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15303; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15304; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
15305; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
15306; GCN-NEXT:    v_mul_f32_e32 v31, v31, v32
15307; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
15308; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
15309; GCN-NEXT:    s_waitcnt vmcnt(0)
15310; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15311; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15312; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
15313; GCN-NEXT:    v_mul_f32_e32 v30, v30, v32
15314; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
15315; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
15316; GCN-NEXT:    s_waitcnt vmcnt(0)
15317; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15318; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15319; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
15320; GCN-NEXT:    v_mul_f32_e32 v29, v29, v32
15321; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
15322; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
15323; GCN-NEXT:    s_waitcnt vmcnt(0)
15324; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15325; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15326; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
15327; GCN-NEXT:    v_mul_f32_e32 v28, v28, v32
15328; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
15329; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
15330; GCN-NEXT:    s_waitcnt vmcnt(0)
15331; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15332; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15333; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
15334; GCN-NEXT:    v_mul_f32_e32 v27, v27, v32
15335; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
15336; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
15337; GCN-NEXT:    s_waitcnt vmcnt(0)
15338; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15339; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15340; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
15341; GCN-NEXT:    v_mul_f32_e32 v26, v26, v32
15342; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
15343; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
15344; GCN-NEXT:    s_waitcnt vmcnt(0)
15345; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15346; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15347; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
15348; GCN-NEXT:    v_mul_f32_e32 v25, v25, v32
15349; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
15350; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
15351; GCN-NEXT:    s_waitcnt vmcnt(0)
15352; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15353; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15354; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
15355; GCN-NEXT:    v_mul_f32_e32 v24, v24, v32
15356; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
15357; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
15358; GCN-NEXT:    s_waitcnt vmcnt(0)
15359; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15360; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15361; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
15362; GCN-NEXT:    v_mul_f32_e32 v23, v23, v32
15363; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
15364; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
15365; GCN-NEXT:    s_waitcnt vmcnt(0)
15366; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15367; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15368; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
15369; GCN-NEXT:    v_mul_f32_e32 v22, v22, v32
15370; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
15371; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
15372; GCN-NEXT:    s_waitcnt vmcnt(0)
15373; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15374; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15375; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
15376; GCN-NEXT:    v_mul_f32_e32 v21, v21, v32
15377; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
15378; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
15379; GCN-NEXT:    s_waitcnt vmcnt(0)
15380; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15381; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15382; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
15383; GCN-NEXT:    v_mul_f32_e32 v20, v20, v32
15384; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
15385; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
15386; GCN-NEXT:    s_waitcnt vmcnt(0)
15387; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15388; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15389; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
15390; GCN-NEXT:    v_mul_f32_e32 v19, v19, v32
15391; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
15392; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
15393; GCN-NEXT:    s_waitcnt vmcnt(0)
15394; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15395; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15396; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
15397; GCN-NEXT:    v_mul_f32_e32 v18, v18, v32
15398; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
15399; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
15400; GCN-NEXT:    s_waitcnt vmcnt(0)
15401; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15402; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15403; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
15404; GCN-NEXT:    v_mul_f32_e32 v17, v17, v32
15405; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
15406; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
15407; GCN-NEXT:    s_waitcnt vmcnt(0)
15408; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15409; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15410; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
15411; GCN-NEXT:    v_mul_f32_e32 v16, v16, v32
15412; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
15413; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
15414; GCN-NEXT:    s_waitcnt vmcnt(0)
15415; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15416; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15417; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
15418; GCN-NEXT:    v_mul_f32_e32 v15, v15, v32
15419; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
15420; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15421; GCN-NEXT:    s_waitcnt vmcnt(0)
15422; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15423; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15424; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
15425; GCN-NEXT:    v_mul_f32_e32 v14, v14, v32
15426; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
15427; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15428; GCN-NEXT:    s_waitcnt vmcnt(0)
15429; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15430; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15431; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
15432; GCN-NEXT:    v_mul_f32_e32 v13, v13, v32
15433; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
15434; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15435; GCN-NEXT:    s_waitcnt vmcnt(0)
15436; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15437; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15438; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
15439; GCN-NEXT:    v_mul_f32_e32 v12, v12, v32
15440; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
15441; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15442; GCN-NEXT:    s_waitcnt vmcnt(0)
15443; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15444; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15445; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
15446; GCN-NEXT:    v_mul_f32_e32 v11, v11, v32
15447; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
15448; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15449; GCN-NEXT:    s_waitcnt vmcnt(0)
15450; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15451; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15452; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
15453; GCN-NEXT:    v_mul_f32_e32 v10, v10, v32
15454; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
15455; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15456; GCN-NEXT:    s_waitcnt vmcnt(0)
15457; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15458; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15459; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
15460; GCN-NEXT:    v_mul_f32_e32 v9, v9, v32
15461; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
15462; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15463; GCN-NEXT:    s_waitcnt vmcnt(0)
15464; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15465; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15466; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
15467; GCN-NEXT:    v_mul_f32_e32 v8, v8, v32
15468; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
15469; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
15470; GCN-NEXT:    s_waitcnt vmcnt(0)
15471; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15472; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15473; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
15474; GCN-NEXT:    v_mul_f32_e32 v7, v7, v32
15475; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
15476; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15477; GCN-NEXT:    s_waitcnt vmcnt(0)
15478; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15479; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15480; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
15481; GCN-NEXT:    v_mul_f32_e32 v6, v6, v32
15482; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15483; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
15484; GCN-NEXT:    s_waitcnt vmcnt(0)
15485; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15486; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15487; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
15488; GCN-NEXT:    v_mul_f32_e32 v5, v5, v32
15489; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15490; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
15491; GCN-NEXT:    s_waitcnt vmcnt(0)
15492; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15493; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15494; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
15495; GCN-NEXT:    v_mul_f32_e32 v4, v4, v32
15496; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15497; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15498; GCN-NEXT:    s_waitcnt vmcnt(0)
15499; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15500; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15501; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
15502; GCN-NEXT:    v_mul_f32_e32 v3, v3, v32
15503; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
15504; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15505; GCN-NEXT:    s_waitcnt vmcnt(0)
15506; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15507; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15508; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
15509; GCN-NEXT:    v_mul_f32_e32 v2, v2, v32
15510; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
15511; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
15512; GCN-NEXT:    s_waitcnt vmcnt(0)
15513; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15514; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15515; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
15516; GCN-NEXT:    v_mul_f32_e32 v1, v1, v32
15517; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
15518; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
15519; GCN-NEXT:    s_waitcnt vmcnt(0)
15520; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
15521; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15522; GCN-NEXT:    v_mul_f32_e32 v0, v0, v32
15523; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
15524; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
15525; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15526; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15527; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
15528; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
15529; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15530; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
15531; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15532; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15533; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15534; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15535; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15536; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15537; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15538; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
15539; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
15540; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
15541; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
15542; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
15543; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
15544; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
15545; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
15546; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
15547; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
15548; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
15549; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
15550; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
15551; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
15552; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
15553; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
15554; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
15555; GCN-NEXT:    s_setpc_b64 s[30:31]
15556;
15557; GFX7-LABEL: v_fmul_v32bf16:
15558; GFX7:       ; %bb.0:
15559; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15560; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
15561; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
15562; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
15563; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
15564; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
15565; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
15566; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
15567; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
15568; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
15569; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
15570; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
15571; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
15572; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
15573; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
15574; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
15575; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
15576; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
15577; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
15578; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
15579; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
15580; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
15581; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
15582; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
15583; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
15584; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
15585; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
15586; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
15587; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
15588; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
15589; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
15590; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
15591; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
15592; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
15593; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
15594; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
15595; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15596; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
15597; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15598; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
15599; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15600; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
15601; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15602; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
15603; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15604; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
15605; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15606; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
15607; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15608; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
15609; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
15610; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
15611; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15612; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15613; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
15614; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15615; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
15616; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15617; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15618; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
15619; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15620; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
15621; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
15622; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
15623; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
15624; GFX7-NEXT:    s_waitcnt vmcnt(1)
15625; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
15626; GFX7-NEXT:    s_waitcnt vmcnt(0)
15627; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15628; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15629; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
15630; GFX7-NEXT:    v_mul_f32_e32 v31, v31, v32
15631; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
15632; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
15633; GFX7-NEXT:    s_waitcnt vmcnt(0)
15634; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15635; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15636; GFX7-NEXT:    v_mul_f32_e32 v30, v30, v32
15637; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
15638; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
15639; GFX7-NEXT:    s_waitcnt vmcnt(0)
15640; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15641; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15642; GFX7-NEXT:    v_mul_f32_e32 v29, v29, v32
15643; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
15644; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
15645; GFX7-NEXT:    s_waitcnt vmcnt(0)
15646; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15647; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15648; GFX7-NEXT:    v_mul_f32_e32 v28, v28, v32
15649; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
15650; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
15651; GFX7-NEXT:    s_waitcnt vmcnt(0)
15652; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15653; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15654; GFX7-NEXT:    v_mul_f32_e32 v27, v27, v32
15655; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
15656; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
15657; GFX7-NEXT:    s_waitcnt vmcnt(0)
15658; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15659; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15660; GFX7-NEXT:    v_mul_f32_e32 v26, v26, v32
15661; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
15662; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
15663; GFX7-NEXT:    s_waitcnt vmcnt(0)
15664; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15665; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15666; GFX7-NEXT:    v_mul_f32_e32 v25, v25, v32
15667; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
15668; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
15669; GFX7-NEXT:    s_waitcnt vmcnt(0)
15670; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15671; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15672; GFX7-NEXT:    v_mul_f32_e32 v24, v24, v32
15673; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
15674; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
15675; GFX7-NEXT:    s_waitcnt vmcnt(0)
15676; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15677; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15678; GFX7-NEXT:    v_mul_f32_e32 v23, v23, v32
15679; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
15680; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
15681; GFX7-NEXT:    s_waitcnt vmcnt(0)
15682; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15683; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15684; GFX7-NEXT:    v_mul_f32_e32 v22, v22, v32
15685; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
15686; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
15687; GFX7-NEXT:    s_waitcnt vmcnt(0)
15688; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15689; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15690; GFX7-NEXT:    v_mul_f32_e32 v21, v21, v32
15691; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
15692; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
15693; GFX7-NEXT:    s_waitcnt vmcnt(0)
15694; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15695; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15696; GFX7-NEXT:    v_mul_f32_e32 v20, v20, v32
15697; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
15698; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
15699; GFX7-NEXT:    s_waitcnt vmcnt(0)
15700; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15701; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15702; GFX7-NEXT:    v_mul_f32_e32 v19, v19, v32
15703; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
15704; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
15705; GFX7-NEXT:    s_waitcnt vmcnt(0)
15706; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15707; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15708; GFX7-NEXT:    v_mul_f32_e32 v18, v18, v32
15709; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
15710; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
15711; GFX7-NEXT:    s_waitcnt vmcnt(0)
15712; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15713; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15714; GFX7-NEXT:    v_mul_f32_e32 v17, v17, v32
15715; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
15716; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
15717; GFX7-NEXT:    s_waitcnt vmcnt(0)
15718; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15719; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15720; GFX7-NEXT:    v_mul_f32_e32 v16, v16, v32
15721; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
15722; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
15723; GFX7-NEXT:    s_waitcnt vmcnt(0)
15724; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15725; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15726; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v32
15727; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
15728; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
15729; GFX7-NEXT:    s_waitcnt vmcnt(0)
15730; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15731; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15732; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v32
15733; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
15734; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15735; GFX7-NEXT:    s_waitcnt vmcnt(0)
15736; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15737; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15738; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v32
15739; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
15740; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15741; GFX7-NEXT:    s_waitcnt vmcnt(0)
15742; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15743; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15744; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v32
15745; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
15746; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15747; GFX7-NEXT:    s_waitcnt vmcnt(0)
15748; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15749; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15750; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v32
15751; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
15752; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15753; GFX7-NEXT:    s_waitcnt vmcnt(0)
15754; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15755; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15756; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v32
15757; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
15758; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15759; GFX7-NEXT:    s_waitcnt vmcnt(0)
15760; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15761; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15762; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v32
15763; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
15764; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15765; GFX7-NEXT:    s_waitcnt vmcnt(0)
15766; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15767; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15768; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v32
15769; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
15770; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15771; GFX7-NEXT:    s_waitcnt vmcnt(0)
15772; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15773; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15774; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v32
15775; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
15776; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
15777; GFX7-NEXT:    s_waitcnt vmcnt(0)
15778; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15779; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15780; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v32
15781; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
15782; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15783; GFX7-NEXT:    s_waitcnt vmcnt(0)
15784; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15785; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15786; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v32
15787; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
15788; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
15789; GFX7-NEXT:    s_waitcnt vmcnt(0)
15790; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15791; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15792; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v32
15793; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
15794; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
15795; GFX7-NEXT:    s_waitcnt vmcnt(0)
15796; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15797; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15798; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v32
15799; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
15800; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15801; GFX7-NEXT:    s_waitcnt vmcnt(0)
15802; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15803; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15804; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v32
15805; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
15806; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15807; GFX7-NEXT:    s_waitcnt vmcnt(0)
15808; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15809; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15810; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v32
15811; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
15812; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
15813; GFX7-NEXT:    s_waitcnt vmcnt(0)
15814; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
15815; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
15816; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v32
15817; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
15818; GFX7-NEXT:    s_setpc_b64 s[30:31]
15819;
15820; GFX8-LABEL: v_fmul_v32bf16:
15821; GFX8:       ; %bb.0:
15822; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15823; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
15824; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
15825; GFX8-NEXT:    v_mul_f32_e32 v31, v32, v31
15826; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
15827; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
15828; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
15829; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
15830; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
15831; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
15832; GFX8-NEXT:    v_mul_f32_e32 v14, v14, v30
15833; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
15834; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
15835; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
15836; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
15837; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
15838; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
15839; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
15840; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
15841; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
15842; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
15843; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
15844; GFX8-NEXT:    v_mul_f32_e32 v32, v32, v30
15845; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32
15846; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
15847; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
15848; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
15849; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
15850; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v29
15851; GFX8-NEXT:    v_bfe_u32 v29, v13, 16, 1
15852; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
15853; GFX8-NEXT:    v_alignbit_b32 v14, v14, v31, 16
15854; GFX8-NEXT:    s_waitcnt vmcnt(0)
15855; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
15856; GFX8-NEXT:    v_mul_f32_e32 v33, v33, v34
15857; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
15858; GFX8-NEXT:    v_mul_f32_e32 v30, v15, v30
15859; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
15860; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
15861; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
15862; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
15863; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
15864; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
15865; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
15866; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
15867; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15868; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
15869; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
15870; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
15871; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
15872; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
15873; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15874; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
15875; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
15876; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
15877; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
15878; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
15879; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
15880; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
15881; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
15882; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
15883; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
15884; GFX8-NEXT:    v_mul_f32_e32 v29, v33, v29
15885; GFX8-NEXT:    v_bfe_u32 v33, v29, 16, 1
15886; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v29
15887; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
15888; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
15889; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15890; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v28
15891; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
15892; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
15893; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
15894; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
15895; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
15896; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
15897; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
15898; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
15899; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
15900; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
15901; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
15902; GFX8-NEXT:    v_mul_f32_e32 v28, v33, v28
15903; GFX8-NEXT:    v_bfe_u32 v33, v28, 16, 1
15904; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v28
15905; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
15906; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
15907; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15908; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v27
15909; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
15910; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
15911; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
15912; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
15913; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
15914; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
15915; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
15916; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
15917; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
15918; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
15919; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
15920; GFX8-NEXT:    v_mul_f32_e32 v27, v33, v27
15921; GFX8-NEXT:    v_bfe_u32 v33, v27, 16, 1
15922; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v27
15923; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
15924; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
15925; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15926; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v26
15927; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
15928; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
15929; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
15930; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
15931; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
15932; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
15933; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
15934; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
15935; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
15936; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
15937; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
15938; GFX8-NEXT:    v_mul_f32_e32 v26, v33, v26
15939; GFX8-NEXT:    v_bfe_u32 v33, v26, 16, 1
15940; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v26
15941; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
15942; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
15943; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15944; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v25
15945; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
15946; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
15947; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
15948; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
15949; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
15950; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
15951; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
15952; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
15953; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
15954; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
15955; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
15956; GFX8-NEXT:    v_mul_f32_e32 v25, v33, v25
15957; GFX8-NEXT:    v_bfe_u32 v33, v25, 16, 1
15958; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v25
15959; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
15960; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
15961; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15962; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v24
15963; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
15964; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
15965; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
15966; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
15967; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
15968; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
15969; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
15970; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
15971; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
15972; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
15973; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
15974; GFX8-NEXT:    v_mul_f32_e32 v24, v33, v24
15975; GFX8-NEXT:    v_bfe_u32 v33, v24, 16, 1
15976; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v24
15977; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
15978; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
15979; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15980; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v23
15981; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
15982; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
15983; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
15984; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
15985; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
15986; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
15987; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
15988; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
15989; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
15990; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
15991; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
15992; GFX8-NEXT:    v_mul_f32_e32 v23, v33, v23
15993; GFX8-NEXT:    v_bfe_u32 v33, v23, 16, 1
15994; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v23
15995; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
15996; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
15997; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
15998; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v22
15999; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
16000; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
16001; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
16002; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
16003; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
16004; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
16005; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
16006; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16007; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
16008; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
16009; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
16010; GFX8-NEXT:    v_mul_f32_e32 v22, v33, v22
16011; GFX8-NEXT:    v_bfe_u32 v33, v22, 16, 1
16012; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v22
16013; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
16014; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
16015; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
16016; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v21
16017; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
16018; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
16019; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
16020; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
16021; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
16022; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
16023; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
16024; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16025; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
16026; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
16027; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
16028; GFX8-NEXT:    v_mul_f32_e32 v21, v33, v21
16029; GFX8-NEXT:    v_bfe_u32 v33, v21, 16, 1
16030; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v21
16031; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
16032; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
16033; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
16034; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v20
16035; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
16036; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
16037; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
16038; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
16039; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
16040; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
16041; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
16042; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
16043; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
16044; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
16045; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
16046; GFX8-NEXT:    v_mul_f32_e32 v20, v33, v20
16047; GFX8-NEXT:    v_bfe_u32 v33, v20, 16, 1
16048; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v20
16049; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
16050; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16051; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
16052; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v19
16053; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
16054; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
16055; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
16056; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
16057; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
16058; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
16059; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
16060; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
16061; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
16062; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
16063; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
16064; GFX8-NEXT:    v_mul_f32_e32 v19, v33, v19
16065; GFX8-NEXT:    v_bfe_u32 v33, v19, 16, 1
16066; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v19
16067; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
16068; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16069; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
16070; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v18
16071; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
16072; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
16073; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
16074; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
16075; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
16076; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
16077; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
16078; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
16079; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
16080; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
16081; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
16082; GFX8-NEXT:    v_mul_f32_e32 v18, v33, v18
16083; GFX8-NEXT:    v_bfe_u32 v33, v18, 16, 1
16084; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v18
16085; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
16086; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
16087; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
16088; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v17
16089; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
16090; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
16091; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
16092; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
16093; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
16094; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
16095; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
16096; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
16097; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
16098; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
16099; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
16100; GFX8-NEXT:    v_mul_f32_e32 v17, v33, v17
16101; GFX8-NEXT:    v_bfe_u32 v33, v17, 16, 1
16102; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v17
16103; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
16104; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
16105; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
16106; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
16107; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
16108; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
16109; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
16110; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
16111; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
16112; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
16113; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
16114; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
16115; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
16116; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
16117; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
16118; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
16119; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
16120; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
16121; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
16122; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
16123; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
16124; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
16125; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
16126; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
16127; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
16128; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
16129; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
16130; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
16131; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
16132; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
16133; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
16134; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
16135; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
16136; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
16137; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
16138; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
16139; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
16140; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
16141; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
16142; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
16143; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
16144; GFX8-NEXT:    v_alignbit_b32 v13, v13, v32, 16
16145; GFX8-NEXT:    v_alignbit_b32 v15, v16, v15, 16
16146; GFX8-NEXT:    s_setpc_b64 s[30:31]
16147;
16148; GFX9-LABEL: v_fmul_v32bf16:
16149; GFX9:       ; %bb.0:
16150; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16151; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
16152; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
16153; GFX9-NEXT:    v_mul_f32_e32 v31, v32, v31
16154; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
16155; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
16156; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
16157; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
16158; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
16159; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
16160; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
16161; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
16162; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
16163; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
16164; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
16165; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
16166; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
16167; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
16168; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
16169; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
16170; GFX9-NEXT:    v_mul_f32_e32 v30, v32, v30
16171; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
16172; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
16173; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
16174; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
16175; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
16176; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
16177; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
16178; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
16179; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
16180; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
16181; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
16182; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
16183; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
16184; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
16185; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
16186; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v29
16187; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
16188; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
16189; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
16190; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
16191; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
16192; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
16193; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
16194; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
16195; GFX9-NEXT:    s_waitcnt vmcnt(0)
16196; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
16197; GFX9-NEXT:    v_mul_f32_e32 v33, v33, v34
16198; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
16199; GFX9-NEXT:    v_mul_f32_e32 v29, v15, v29
16200; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
16201; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
16202; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
16203; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
16204; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
16205; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
16206; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
16207; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
16208; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
16209; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
16210; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
16211; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
16212; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
16213; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
16214; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
16215; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
16216; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
16217; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
16218; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
16219; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
16220; GFX9-NEXT:    v_mul_f32_e32 v28, v33, v28
16221; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
16222; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
16223; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
16224; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
16225; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
16226; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
16227; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
16228; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
16229; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
16230; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
16231; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
16232; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
16233; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
16234; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
16235; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
16236; GFX9-NEXT:    v_mul_f32_e32 v27, v33, v27
16237; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
16238; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
16239; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
16240; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
16241; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
16242; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
16243; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
16244; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
16245; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
16246; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
16247; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
16248; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
16249; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
16250; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
16251; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
16252; GFX9-NEXT:    v_mul_f32_e32 v26, v33, v26
16253; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
16254; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
16255; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
16256; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
16257; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
16258; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
16259; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
16260; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
16261; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
16262; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
16263; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
16264; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
16265; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
16266; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
16267; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
16268; GFX9-NEXT:    v_mul_f32_e32 v25, v33, v25
16269; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
16270; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
16271; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
16272; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
16273; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
16274; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
16275; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
16276; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
16277; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
16278; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
16279; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
16280; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
16281; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
16282; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
16283; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
16284; GFX9-NEXT:    v_mul_f32_e32 v24, v33, v24
16285; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
16286; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
16287; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
16288; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
16289; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
16290; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
16291; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
16292; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
16293; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
16294; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
16295; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
16296; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
16297; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
16298; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
16299; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
16300; GFX9-NEXT:    v_mul_f32_e32 v23, v33, v23
16301; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
16302; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
16303; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
16304; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
16305; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
16306; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
16307; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
16308; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
16309; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
16310; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
16311; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
16312; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16313; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
16314; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
16315; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
16316; GFX9-NEXT:    v_mul_f32_e32 v22, v33, v22
16317; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
16318; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
16319; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
16320; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
16321; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
16322; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
16323; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
16324; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
16325; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
16326; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
16327; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
16328; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16329; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
16330; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
16331; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
16332; GFX9-NEXT:    v_mul_f32_e32 v21, v33, v21
16333; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
16334; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
16335; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
16336; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
16337; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
16338; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
16339; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
16340; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
16341; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
16342; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
16343; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
16344; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
16345; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
16346; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
16347; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
16348; GFX9-NEXT:    v_mul_f32_e32 v20, v33, v20
16349; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
16350; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16351; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
16352; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
16353; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
16354; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
16355; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
16356; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
16357; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
16358; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
16359; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
16360; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
16361; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
16362; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
16363; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
16364; GFX9-NEXT:    v_mul_f32_e32 v19, v33, v19
16365; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
16366; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16367; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
16368; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
16369; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
16370; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
16371; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
16372; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
16373; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
16374; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
16375; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
16376; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
16377; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
16378; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
16379; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
16380; GFX9-NEXT:    v_mul_f32_e32 v18, v33, v18
16381; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
16382; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
16383; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
16384; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
16385; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
16386; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
16387; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
16388; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
16389; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
16390; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
16391; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
16392; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
16393; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
16394; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
16395; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
16396; GFX9-NEXT:    v_mul_f32_e32 v17, v33, v17
16397; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
16398; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
16399; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
16400; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
16401; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
16402; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
16403; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
16404; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
16405; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
16406; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
16407; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
16408; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
16409; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
16410; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
16411; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
16412; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
16413; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
16414; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
16415; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
16416; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
16417; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
16418; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
16419; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
16420; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
16421; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
16422; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
16423; GFX9-NEXT:    v_perm_b32 v12, v12, v32, s4
16424; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
16425; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
16426; GFX9-NEXT:    v_perm_b32 v15, v29, v15, s4
16427; GFX9-NEXT:    s_setpc_b64 s[30:31]
16428;
16429; GFX10-LABEL: v_fmul_v32bf16:
16430; GFX10:       ; %bb.0:
16431; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16432; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
16433; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
16434; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
16435; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
16436; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
16437; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
16438; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
16439; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
16440; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
16441; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
16442; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
16443; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
16444; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
16445; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
16446; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
16447; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
16448; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
16449; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
16450; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
16451; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
16452; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
16453; GFX10-NEXT:    v_mul_f32_e32 v39, v48, v39
16454; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v6
16455; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
16456; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
16457; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
16458; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v21
16459; GFX10-NEXT:    v_mul_f32_e32 v49, v50, v49
16460; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
16461; GFX10-NEXT:    v_mul_f32_e32 v33, v34, v33
16462; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
16463; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
16464; GFX10-NEXT:    v_mul_f32_e32 v35, v36, v35
16465; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
16466; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
16467; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
16468; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
16469; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
16470; GFX10-NEXT:    v_mul_f32_e32 v37, v38, v37
16471; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v7
16472; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
16473; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
16474; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
16475; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
16476; GFX10-NEXT:    v_mul_f32_e32 v27, v50, v27
16477; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v0
16478; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
16479; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
16480; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
16481; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
16482; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
16483; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
16484; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
16485; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
16486; GFX10-NEXT:    v_mul_f32_e32 v8, v8, v24
16487; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v18
16488; GFX10-NEXT:    v_mul_f32_e32 v29, v38, v29
16489; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v2
16490; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
16491; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16492; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v23
16493; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
16494; GFX10-NEXT:    v_mul_f32_e32 v28, v48, v28
16495; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
16496; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
16497; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
16498; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
16499; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
16500; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v26
16501; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
16502; GFX10-NEXT:    v_mul_f32_e32 v34, v34, v51
16503; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
16504; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
16505; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
16506; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
16507; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
16508; GFX10-NEXT:    v_mul_f32_e32 v30, v36, v30
16509; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v3
16510; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
16511; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16512; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v18
16513; GFX10-NEXT:    v_mul_f32_e32 v18, v48, v23
16514; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
16515; GFX10-NEXT:    v_mul_f32_e32 v17, v50, v22
16516; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v33
16517; GFX10-NEXT:    v_bfe_u32 v23, v14, 16, 1
16518; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
16519; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
16520; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
16521; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
16522; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v20
16523; GFX10-NEXT:    v_mul_f32_e32 v20, v36, v25
16524; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v19
16525; GFX10-NEXT:    v_mul_f32_e32 v19, v38, v24
16526; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v14
16527; GFX10-NEXT:    v_bfe_u32 v25, v35, 16, 1
16528; GFX10-NEXT:    v_add3_u32 v23, v23, v14, 0x7fff
16529; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v22, vcc_lo
16530; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
16531; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
16532; GFX10-NEXT:    v_mul_f32_e32 v21, v51, v26
16533; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v35
16534; GFX10-NEXT:    v_bfe_u32 v36, v13, 16, 1
16535; GFX10-NEXT:    v_add3_u32 v25, v25, v35, 0x7fff
16536; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
16537; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
16538; GFX10-NEXT:    v_or_b32_e32 v38, 0x400000, v13
16539; GFX10-NEXT:    v_bfe_u32 v48, v37, 16, 1
16540; GFX10-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
16541; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v37
16542; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
16543; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
16544; GFX10-NEXT:    v_bfe_u32 v51, v12, 16, 1
16545; GFX10-NEXT:    v_add3_u32 v48, v48, v37, 0x7fff
16546; GFX10-NEXT:    v_or_b32_e32 v33, 0x400000, v12
16547; GFX10-NEXT:    v_bfe_u32 v22, v39, 16, 1
16548; GFX10-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc_lo
16549; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
16550; GFX10-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
16551; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v39
16552; GFX10-NEXT:    v_bfe_u32 v24, v11, 16, 1
16553; GFX10-NEXT:    v_add3_u32 v22, v22, v39, 0x7fff
16554; GFX10-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc_lo
16555; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
16556; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v11
16557; GFX10-NEXT:    v_bfe_u32 v26, v49, 16, 1
16558; GFX10-NEXT:    v_add3_u32 v24, v24, v11, 0x7fff
16559; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v49
16560; GFX10-NEXT:    v_cndmask_b32_e32 v33, v51, v33, vcc_lo
16561; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
16562; GFX10-NEXT:    v_bfe_u32 v38, v10, 16, 1
16563; GFX10-NEXT:    v_add3_u32 v26, v26, v49, 0x7fff
16564; GFX10-NEXT:    v_or_b32_e32 v37, 0x400000, v10
16565; GFX10-NEXT:    v_bfe_u32 v50, v34, 16, 1
16566; GFX10-NEXT:    v_cndmask_b32_e32 v14, v22, v14, vcc_lo
16567; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
16568; GFX10-NEXT:    v_add3_u32 v38, v38, v10, 0x7fff
16569; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v34
16570; GFX10-NEXT:    v_bfe_u32 v51, v9, 16, 1
16571; GFX10-NEXT:    v_add3_u32 v50, v50, v34, 0x7fff
16572; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v35, vcc_lo
16573; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
16574; GFX10-NEXT:    v_or_b32_e32 v39, 0x400000, v9
16575; GFX10-NEXT:    v_bfe_u32 v22, v30, 16, 1
16576; GFX10-NEXT:    v_add3_u32 v51, v51, v9, 0x7fff
16577; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v30
16578; GFX10-NEXT:    v_cndmask_b32_e32 v13, v26, v13, vcc_lo
16579; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
16580; GFX10-NEXT:    v_bfe_u32 v35, v8, 16, 1
16581; GFX10-NEXT:    v_add3_u32 v22, v22, v30, 0x7fff
16582; GFX10-NEXT:    v_or_b32_e32 v49, 0x400000, v8
16583; GFX10-NEXT:    v_bfe_u32 v26, v29, 16, 1
16584; GFX10-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc_lo
16585; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
16586; GFX10-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
16587; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v29
16588; GFX10-NEXT:    v_bfe_u32 v38, v7, 16, 1
16589; GFX10-NEXT:    v_add3_u32 v26, v26, v29, 0x7fff
16590; GFX10-NEXT:    v_cndmask_b32_e32 v12, v50, v12, vcc_lo
16591; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
16592; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v7
16593; GFX10-NEXT:    v_bfe_u32 v50, v28, 16, 1
16594; GFX10-NEXT:    v_add3_u32 v38, v38, v7, 0x7fff
16595; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v28
16596; GFX10-NEXT:    v_cndmask_b32_e32 v39, v51, v39, vcc_lo
16597; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
16598; GFX10-NEXT:    v_bfe_u32 v51, v6, 16, 1
16599; GFX10-NEXT:    v_add3_u32 v50, v50, v28, 0x7fff
16600; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v6
16601; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
16602; GFX10-NEXT:    v_cndmask_b32_e32 v11, v22, v11, vcc_lo
16603; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
16604; GFX10-NEXT:    v_bfe_u32 v22, v27, 16, 1
16605; GFX10-NEXT:    v_add3_u32 v51, v51, v6, 0x7fff
16606; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v27
16607; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
16608; GFX10-NEXT:    v_cndmask_b32_e32 v35, v35, v49, vcc_lo
16609; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
16610; GFX10-NEXT:    v_bfe_u32 v49, v5, 16, 1
16611; GFX10-NEXT:    v_add3_u32 v22, v22, v27, 0x7fff
16612; GFX10-NEXT:    v_or_b32_e32 v29, 0x400000, v5
16613; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
16614; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
16615; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
16616; GFX10-NEXT:    v_add3_u32 v49, v49, v5, 0x7fff
16617; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v21
16618; GFX10-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
16619; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
16620; GFX10-NEXT:    v_bfe_u32 v38, v4, 16, 1
16621; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
16622; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v4
16623; GFX10-NEXT:    v_cndmask_b32_e32 v9, v50, v9, vcc_lo
16624; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16625; GFX10-NEXT:    v_bfe_u32 v50, v20, 16, 1
16626; GFX10-NEXT:    v_add3_u32 v38, v38, v4, 0x7fff
16627; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v20
16628; GFX10-NEXT:    v_cndmask_b32_e32 v30, v51, v30, vcc_lo
16629; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
16630; GFX10-NEXT:    v_add3_u32 v50, v50, v20, 0x7fff
16631; GFX10-NEXT:    v_bfe_u32 v51, v3, 16, 1
16632; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v3
16633; GFX10-NEXT:    v_cndmask_b32_e32 v8, v22, v8, vcc_lo
16634; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16635; GFX10-NEXT:    v_bfe_u32 v22, v19, 16, 1
16636; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v19
16637; GFX10-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
16638; GFX10-NEXT:    v_cndmask_b32_e32 v29, v49, v29, vcc_lo
16639; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
16640; GFX10-NEXT:    v_add3_u32 v22, v22, v19, 0x7fff
16641; GFX10-NEXT:    v_bfe_u32 v49, v2, 16, 1
16642; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v2
16643; GFX10-NEXT:    v_cndmask_b32_e32 v7, v26, v7, vcc_lo
16644; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
16645; GFX10-NEXT:    v_bfe_u32 v26, v18, 16, 1
16646; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v18
16647; GFX10-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
16648; GFX10-NEXT:    v_cndmask_b32_e32 v28, v38, v28, vcc_lo
16649; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
16650; GFX10-NEXT:    v_bfe_u32 v38, v1, 16, 1
16651; GFX10-NEXT:    v_add3_u32 v26, v26, v18, 0x7fff
16652; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v1
16653; GFX10-NEXT:    v_cndmask_b32_e32 v6, v50, v6, vcc_lo
16654; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
16655; GFX10-NEXT:    v_bfe_u32 v50, v17, 16, 1
16656; GFX10-NEXT:    v_add3_u32 v38, v38, v1, 0x7fff
16657; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v17
16658; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
16659; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
16660; GFX10-NEXT:    v_bfe_u32 v22, v0, 16, 1
16661; GFX10-NEXT:    v_add3_u32 v50, v50, v17, 0x7fff
16662; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v0
16663; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v4, vcc_lo
16664; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
16665; GFX10-NEXT:    v_add3_u32 v22, v22, v0, 0x7fff
16666; GFX10-NEXT:    v_cndmask_b32_e32 v1, v38, v20, vcc_lo
16667; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
16668; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
16669; GFX10-NEXT:    v_cndmask_b32_e32 v17, v50, v19, vcc_lo
16670; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
16671; GFX10-NEXT:    v_perm_b32 v4, v28, v7, 0x7060302
16672; GFX10-NEXT:    v_perm_b32 v7, v34, v10, 0x7060302
16673; GFX10-NEXT:    v_cndmask_b32_e32 v0, v22, v18, vcc_lo
16674; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
16675; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
16676; GFX10-NEXT:    v_cndmask_b32_e32 v2, v49, v21, vcc_lo
16677; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
16678; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
16679; GFX10-NEXT:    v_cndmask_b32_e32 v3, v51, v27, vcc_lo
16680; GFX10-NEXT:    v_perm_b32 v5, v29, v8, 0x7060302
16681; GFX10-NEXT:    v_perm_b32 v8, v35, v11, 0x7060302
16682; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
16683; GFX10-NEXT:    v_perm_b32 v6, v30, v9, 0x7060302
16684; GFX10-NEXT:    v_perm_b32 v9, v39, v12, 0x7060302
16685; GFX10-NEXT:    s_waitcnt vmcnt(0)
16686; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
16687; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
16688; GFX10-NEXT:    v_mul_f32_e32 v17, v31, v17
16689; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v18
16690; GFX10-NEXT:    v_bfe_u32 v10, v17, 16, 1
16691; GFX10-NEXT:    v_bfe_u32 v11, v15, 16, 1
16692; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v17
16693; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
16694; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v15
16695; GFX10-NEXT:    v_add3_u32 v18, v10, v17, 0x7fff
16696; GFX10-NEXT:    v_add3_u32 v11, v11, v15, 0x7fff
16697; GFX10-NEXT:    v_perm_b32 v10, v37, v13, 0x7060302
16698; GFX10-NEXT:    v_perm_b32 v13, v36, v25, 0x7060302
16699; GFX10-NEXT:    v_cndmask_b32_e32 v17, v18, v12, vcc_lo
16700; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
16701; GFX10-NEXT:    v_perm_b32 v12, v33, v48, 0x7060302
16702; GFX10-NEXT:    v_cndmask_b32_e32 v15, v11, v19, vcc_lo
16703; GFX10-NEXT:    v_perm_b32 v11, v24, v14, 0x7060302
16704; GFX10-NEXT:    v_perm_b32 v14, v23, v16, 0x7060302
16705; GFX10-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
16706; GFX10-NEXT:    s_setpc_b64 s[30:31]
16707;
16708; GFX11-LABEL: v_fmul_v32bf16:
16709; GFX11:       ; %bb.0:
16710; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16711; GFX11-NEXT:    scratch_load_b32 v32, off, s32
16712; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
16713; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
16714; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
16715; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
16716; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
16717; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
16718; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
16719; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
16720; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
16721; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
16722; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
16723; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16724; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
16725; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
16726; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
16727; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
16728; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
16729; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
16730; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
16731; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
16732; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
16733; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
16734; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16735; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
16736; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
16737; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
16738; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16739; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
16740; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
16741; GFX11-NEXT:    v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
16742; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
16743; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
16744; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
16745; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
16746; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
16747; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
16748; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
16749; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
16750; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
16751; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
16752; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
16753; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
16754; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
16755; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
16756; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16757; GFX11-NEXT:    v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
16758; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
16759; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
16760; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
16761; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
16762; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
16763; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
16764; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
16765; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
16766; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
16767; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
16768; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
16769; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
16770; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
16771; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
16772; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
16773; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v16
16774; GFX11-NEXT:    v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
16775; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
16776; GFX11-NEXT:    v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
16777; GFX11-NEXT:    v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
16778; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
16779; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
16780; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
16781; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
16782; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
16783; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
16784; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
16785; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
16786; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
16787; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
16788; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
16789; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
16790; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
16791; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
16792; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
16793; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v20
16794; GFX11-NEXT:    v_mul_f32_e32 v20, v80, v71
16795; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
16796; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
16797; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
16798; GFX11-NEXT:    v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
16799; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
16800; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
16801; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
16802; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
16803; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
16804; GFX11-NEXT:    v_mul_f32_e32 v26, v52, v51
16805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
16806; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v22
16807; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
16808; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
16809; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
16810; GFX11-NEXT:    v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
16811; GFX11-NEXT:    v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
16812; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
16813; GFX11-NEXT:    v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
16814; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
16815; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16816; GFX11-NEXT:    v_mul_f32_e32 v29, v38, v37
16817; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
16818; GFX11-NEXT:    v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
16819; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
16820; GFX11-NEXT:    v_mul_f32_e32 v14, v14, v30
16821; GFX11-NEXT:    v_mul_f32_e32 v28, v48, v39
16822; GFX11-NEXT:    v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
16823; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
16824; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
16825; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
16826; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
16827; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
16828; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
16829; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
16830; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
16831; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
16832; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
16833; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
16834; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
16835; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
16836; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
16837; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
16838; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
16839; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
16840; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
16841; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
16842; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
16843; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
16844; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
16845; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
16846; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
16847; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
16848; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
16849; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
16850; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
16851; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
16852; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
16853; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
16854; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
16855; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
16856; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
16857; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
16858; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
16859; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
16860; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
16861; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
16862; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
16863; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
16864; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
16865; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
16866; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
16867; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
16868; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
16869; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
16870; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
16871; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
16872; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
16873; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
16874; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
16875; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
16876; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
16877; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
16878; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
16879; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
16880; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
16881; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
16882; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
16883; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
16884; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
16885; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
16886; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
16887; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
16888; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
16889; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
16890; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
16891; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
16892; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
16893; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
16894; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
16895; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
16896; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
16897; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
16898; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
16899; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
16900; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
16901; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
16902; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
16903; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
16904; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
16905; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
16906; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
16907; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
16908; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
16909; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
16910; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
16911; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
16912; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
16913; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
16914; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
16915; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
16916; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
16917; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
16918; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
16919; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
16920; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
16921; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
16922; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
16923; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
16924; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
16925; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
16926; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
16927; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16928; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
16929; GFX11-NEXT:    v_cndmask_b32_e32 v23, v97, v98, vcc_lo
16930; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16931; GFX11-NEXT:    v_cndmask_b32_e32 v6, v99, v100, vcc_lo
16932; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
16933; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
16934; GFX11-NEXT:    v_cndmask_b32_e32 v22, v101, v102, vcc_lo
16935; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16936; GFX11-NEXT:    v_cndmask_b32_e32 v5, v103, v112, vcc_lo
16937; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
16938; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16939; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
16940; GFX11-NEXT:    v_cndmask_b32_e32 v21, v113, v114, vcc_lo
16941; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
16942; GFX11-NEXT:    v_cndmask_b32_e32 v4, v115, v116, vcc_lo
16943; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
16944; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
16945; GFX11-NEXT:    v_cndmask_b32_e32 v20, v117, v118, vcc_lo
16946; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
16947; GFX11-NEXT:    v_cndmask_b32_e32 v19, v129, v130, vcc_lo
16948; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
16949; GFX11-NEXT:    v_cndmask_b32_e32 v18, v133, v134, vcc_lo
16950; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
16951; GFX11-NEXT:    v_cndmask_b32_e32 v1, v135, v144, vcc_lo
16952; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
16953; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16954; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
16955; GFX11-NEXT:    v_cndmask_b32_e32 v17, v145, v146, vcc_lo
16956; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
16957; GFX11-NEXT:    v_cndmask_b32_e32 v0, v147, v33, vcc_lo
16958; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
16959; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
16960; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
16961; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
16962; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
16963; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
16964; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
16965; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
16966; GFX11-NEXT:    s_waitcnt vmcnt(0)
16967; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
16968; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16969; GFX11-NEXT:    v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
16970; GFX11-NEXT:    v_mul_f32_e32 v15, v15, v18
16971; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16972; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
16973; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
16974; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
16975; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
16976; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
16977; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
16978; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
16979; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
16980; GFX11-NEXT:    v_cndmask_b32_e32 v17, v18, v20, vcc_lo
16981; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
16982; GFX11-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
16983; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16984; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
16985; GFX11-NEXT:    s_setpc_b64 s[30:31]
16986  %op = fmul <32 x bfloat> %a, %b
16987  ret <32 x bfloat> %op
16988}
16989
; Scalar bfloat division with both operands passed as ordinary (non-inreg)
; arguments. In the expected output below every target performs the divide in
; f32 using the v_div_scale / v_rcp / v_div_fmas / v_div_fixup sequence.
; The GCN and GFX7 output masks the operands and the result with 0xffff0000.
; GFX8 and newer instead shift the operands left by 16 and narrow the f32
; result with a bfe / add 0x7fff / cmp_u / cndmask sequence before shifting
; right by 16 (presumably round-to-nearest-even with NaN quieting; confirm
; against the lowering code).
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
16990define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
16991; GCN-LABEL: v_fdiv_bf16:
16992; GCN:       ; %bb.0:
16993; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16994; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
16995; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
16996; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
16997; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
16998; GCN-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
16999; GCN-NEXT:    v_rcp_f32_e32 v3, v2
17000; GCN-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
17001; GCN-NEXT:    v_fma_f32 v3, v4, v3, v3
17002; GCN-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
17003; GCN-NEXT:    v_mul_f32_e32 v5, v4, v3
17004; GCN-NEXT:    v_fma_f32 v6, -v2, v5, v4
17005; GCN-NEXT:    v_fma_f32 v5, v6, v3, v5
17006; GCN-NEXT:    v_fma_f32 v2, -v2, v5, v4
17007; GCN-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
17008; GCN-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
17009; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17010; GCN-NEXT:    s_setpc_b64 s[30:31]
17011;
17012; GFX7-LABEL: v_fdiv_bf16:
17013; GFX7:       ; %bb.0:
17014; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17015; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17016; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17017; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17018; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17019; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
17020; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
17021; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
17022; GFX7-NEXT:    v_fma_f32 v3, v4, v3, v3
17023; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
17024; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
17025; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
17026; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
17027; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
17028; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
17029; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
17030; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17031; GFX7-NEXT:    s_setpc_b64 s[30:31]
17032;
17033; GFX8-LABEL: v_fdiv_bf16:
17034; GFX8:       ; %bb.0:
17035; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17036; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17037; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17038; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
17039; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
17040; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
17041; GFX8-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
17042; GFX8-NEXT:    v_fma_f32 v4, v5, v4, v4
17043; GFX8-NEXT:    v_mul_f32_e32 v5, v3, v4
17044; GFX8-NEXT:    v_fma_f32 v6, -v2, v5, v3
17045; GFX8-NEXT:    v_fma_f32 v5, v6, v4, v5
17046; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
17047; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
17048; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
17049; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
17050; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
17051; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
17052; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17053; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17054; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
17055; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17056; GFX8-NEXT:    s_setpc_b64 s[30:31]
17057;
17058; GFX9-LABEL: v_fdiv_bf16:
17059; GFX9:       ; %bb.0:
17060; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17061; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17062; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17063; GFX9-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
17064; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
17065; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
17066; GFX9-NEXT:    v_rcp_f32_e32 v4, v2
17067; GFX9-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
17068; GFX9-NEXT:    v_fma_f32 v4, v5, v4, v4
17069; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v4
17070; GFX9-NEXT:    v_fma_f32 v6, -v2, v5, v3
17071; GFX9-NEXT:    v_fma_f32 v5, v6, v4, v5
17072; GFX9-NEXT:    v_fma_f32 v2, -v2, v5, v3
17073; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
17074; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
17075; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
17076; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17077; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
17078; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17079; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
17080; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17081; GFX9-NEXT:    s_setpc_b64 s[30:31]
17082;
17083; GFX10-LABEL: v_fdiv_bf16:
17084; GFX10:       ; %bb.0:
17085; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17086; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17087; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17088; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
17089; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
17090; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
17091; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
17092; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
17093; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v3
17094; GFX10-NEXT:    v_fma_f32 v6, -v2, v4, v5
17095; GFX10-NEXT:    v_fmac_f32_e32 v4, v6, v3
17096; GFX10-NEXT:    v_fma_f32 v2, -v2, v4, v5
17097; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v4
17098; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
17099; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
17100; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17101; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17102; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
17103; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17104; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17105; GFX10-NEXT:    s_setpc_b64 s[30:31]
17106;
17107; GFX11-LABEL: v_fdiv_bf16:
17108; GFX11:       ; %bb.0:
17109; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17110; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17111; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17112; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17113; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
17114; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
17115; GFX11-NEXT:    s_waitcnt_depctr 0xfff
17116; GFX11-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
17117; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
17118; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v3
17119; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
17120; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v3
17121; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17122; GFX11-NEXT:    v_fma_f32 v6, -v2, v4, v5
17123; GFX11-NEXT:    v_fmac_f32_e32 v4, v6, v3
17124; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17125; GFX11-NEXT:    v_fma_f32 v2, -v2, v4, v5
17126; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v4
17127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17128; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
17129; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
17130; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17131; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17132; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17133; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
17134; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17135; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17136; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17137; GFX11-NEXT:    s_setpc_b64 s[30:31]
17138  %op = fdiv bfloat %a, %b
17139  ret bfloat %op
17140}
17141
; fabs intrinsic for scalar bfloat; called by the fabs and fneg(fabs) tests
; that follow.
17142declare bfloat @llvm.fabs.bf16(bfloat)
17143
; llvm.fabs on a scalar bfloat passed as an ordinary (non-inreg) argument.
; Expected output: GCN and GFX7 clear the sign bit with a 0x7fffffff mask
; between 0xffff0000 masks, while GFX8 and newer use a single AND with 0x7fff.
; The two GFX11 run lines get separate check prefixes here because
; +real-true16 selects a 16-bit v_and_b16 on v0.l whereas -real-true16 keeps
; the 32-bit v_and_b32.
17144define bfloat @v_fabs_bf16(bfloat %a) {
17145; GCN-LABEL: v_fabs_bf16:
17146; GCN:       ; %bb.0:
17147; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17148; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17149; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17150; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
17151; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17152; GCN-NEXT:    s_setpc_b64 s[30:31]
17153;
17154; GFX7-LABEL: v_fabs_bf16:
17155; GFX7:       ; %bb.0:
17156; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17157; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17158; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17159; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
17160; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17161; GFX7-NEXT:    s_setpc_b64 s[30:31]
17162;
17163; GFX8-LABEL: v_fabs_bf16:
17164; GFX8:       ; %bb.0:
17165; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17166; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
17167; GFX8-NEXT:    s_setpc_b64 s[30:31]
17168;
17169; GFX9-LABEL: v_fabs_bf16:
17170; GFX9:       ; %bb.0:
17171; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17172; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
17173; GFX9-NEXT:    s_setpc_b64 s[30:31]
17174;
17175; GFX10-LABEL: v_fabs_bf16:
17176; GFX10:       ; %bb.0:
17177; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17178; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
17179; GFX10-NEXT:    s_setpc_b64 s[30:31]
17180;
17181; GFX11TRUE16-LABEL: v_fabs_bf16:
17182; GFX11TRUE16:       ; %bb.0:
17183; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17184; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 0x7fff, v0.l
17185; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
17186;
17187; GFX11FAKE16-LABEL: v_fabs_bf16:
17188; GFX11FAKE16:       ; %bb.0:
17189; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17190; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
17191; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
17192  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
17193  ret bfloat %op
17194}
17195
; llvm.fabs on an inreg bfloat argument of an amdgpu_ps function. The result
; is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane before being returned.
; Expected output: GCN and GFX7 go through VALU instructions and end with
; v_readfirstlane_b32, while GFX8 and newer use two s_and_b32 instructions
; (0x7fff, then 0xffff).
; NOTE(review): the s_fneg tests later in this file label the same
; readfirstlane wrapper with a FIXME calling it a hack; presumably that applies
; here too -- confirm.
17196define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
17197; GCN-LABEL: s_fabs_bf16:
17198; GCN:       ; %bb.0:
17199; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
17200; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
17201; GCN-NEXT:    v_readfirstlane_b32 s0, v0
17202; GCN-NEXT:    ; return to shader part epilog
17203;
17204; GFX7-LABEL: s_fabs_bf16:
17205; GFX7:       ; %bb.0:
17206; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
17207; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
17208; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
17209; GFX7-NEXT:    ; return to shader part epilog
17210;
17211; GFX8-LABEL: s_fabs_bf16:
17212; GFX8:       ; %bb.0:
17213; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
17214; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
17215; GFX8-NEXT:    ; return to shader part epilog
17216;
17217; GFX9-LABEL: s_fabs_bf16:
17218; GFX9:       ; %bb.0:
17219; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
17220; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
17221; GFX9-NEXT:    ; return to shader part epilog
17222;
17223; GFX10-LABEL: s_fabs_bf16:
17224; GFX10:       ; %bb.0:
17225; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
17226; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
17227; GFX10-NEXT:    ; return to shader part epilog
17228;
17229; GFX11-LABEL: s_fabs_bf16:
17230; GFX11:       ; %bb.0:
17231; GFX11-NEXT:    s_and_b32 s0, s0, 0x7fff
17232; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17233; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
17234; GFX11-NEXT:    ; return to shader part epilog
17235  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
17236  %cast = bitcast bfloat %op to i16
17237  %zext = zext i16 %cast to i32
17238  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
17239  ret i32 %readlane
17240}
17241
; fneg of a scalar bfloat passed as an ordinary (non-inreg) argument.
; Expected output is a single XOR on every target: with 0x80000000 on GCN and
; GFX7, and with 0x8000 on GFX8 and newer. The two GFX11 run lines get
; separate check prefixes because +real-true16 selects v_xor_b16 on v0.l while
; -real-true16 keeps v_xor_b32.
17242define bfloat @v_fneg_bf16(bfloat %a) {
17243; GCN-LABEL: v_fneg_bf16:
17244; GCN:       ; %bb.0:
17245; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17246; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
17247; GCN-NEXT:    s_setpc_b64 s[30:31]
17248;
17249; GFX7-LABEL: v_fneg_bf16:
17250; GFX7:       ; %bb.0:
17251; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17252; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
17253; GFX7-NEXT:    s_setpc_b64 s[30:31]
17254;
17255; GFX8-LABEL: v_fneg_bf16:
17256; GFX8:       ; %bb.0:
17257; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17258; GFX8-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
17259; GFX8-NEXT:    s_setpc_b64 s[30:31]
17260;
17261; GFX9-LABEL: v_fneg_bf16:
17262; GFX9:       ; %bb.0:
17263; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17264; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
17265; GFX9-NEXT:    s_setpc_b64 s[30:31]
17266;
17267; GFX10-LABEL: v_fneg_bf16:
17268; GFX10:       ; %bb.0:
17269; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17270; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
17271; GFX10-NEXT:    s_setpc_b64 s[30:31]
17272;
17273; GFX11TRUE16-LABEL: v_fneg_bf16:
17274; GFX11TRUE16:       ; %bb.0:
17275; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17276; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
17277; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
17278;
17279; GFX11FAKE16-LABEL: v_fneg_bf16:
17280; GFX11FAKE16:       ; %bb.0:
17281; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17282; GFX11FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
17283; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
17284  %op = fneg bfloat %a
17285  ret bfloat %op
17286}
17287
; readfirstlane intrinsic that the s_* (amdgpu_ps) tests around here wrap
; their i32 return value in. @s_fabs_bf16 above already calls it ahead of this
; declaration.
17288declare i32 @llvm.amdgcn.readfirstlane(i32)
17289
; fneg of an inreg bfloat argument of an amdgpu_ps function. The result is
; bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane before being returned.
; Expected output: GCN and GFX7 multiply by -1.0 on the VALU, shift right by
; 16 and end with v_readfirstlane_b32, while GFX8 and newer use s_xor_b32 with
; 0x8000 followed by s_and_b32 with 0xffff.
17290; FIXME: readfirstlane hack for other bugs
17291define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
17292; GCN-LABEL: s_fneg_bf16:
17293; GCN:       ; %bb.0:
17294; GCN-NEXT:    v_mul_f32_e64 v0, -1.0, s0
17295; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17296; GCN-NEXT:    v_readfirstlane_b32 s0, v0
17297; GCN-NEXT:    ; return to shader part epilog
17298;
17299; GFX7-LABEL: s_fneg_bf16:
17300; GFX7:       ; %bb.0:
17301; GFX7-NEXT:    v_mul_f32_e64 v0, -1.0, s0
17302; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17303; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
17304; GFX7-NEXT:    ; return to shader part epilog
17305;
17306; GFX8-LABEL: s_fneg_bf16:
17307; GFX8:       ; %bb.0:
17308; GFX8-NEXT:    s_xor_b32 s0, s0, 0x8000
17309; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
17310; GFX8-NEXT:    ; return to shader part epilog
17311;
17312; GFX9-LABEL: s_fneg_bf16:
17313; GFX9:       ; %bb.0:
17314; GFX9-NEXT:    s_xor_b32 s0, s0, 0x8000
17315; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
17316; GFX9-NEXT:    ; return to shader part epilog
17317;
17318; GFX10-LABEL: s_fneg_bf16:
17319; GFX10:       ; %bb.0:
17320; GFX10-NEXT:    s_xor_b32 s0, s0, 0x8000
17321; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
17322; GFX10-NEXT:    ; return to shader part epilog
17323;
17324; GFX11-LABEL: s_fneg_bf16:
17325; GFX11:       ; %bb.0:
17326; GFX11-NEXT:    s_xor_b32 s0, s0, 0x8000
17327; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17328; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
17329; GFX11-NEXT:    ; return to shader part epilog
17330  %op = fneg bfloat %a
17331  %cast = bitcast bfloat %op to i16
17332  %zext = zext i16 %cast to i32
17333  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
17334  ret i32 %readlane
17335}
17336
; fneg applied to llvm.fabs of a scalar bfloat passed as an ordinary
; (non-inreg) argument.
; Expected output: GCN and GFX7 emit the fabs mask (0x7fffffff) and the fneg
; XOR (0x80000000) as separate steps with 0xffff0000 masks in between, while
; GFX8 and newer fold the pair into a single OR with 0x8000. The two GFX11
; run lines get separate check prefixes because +real-true16 selects v_or_b16
; on v0.l while -real-true16 keeps v_or_b32.
17337define bfloat @v_fneg_fabs_bf16(bfloat %a) {
17338; GCN-LABEL: v_fneg_fabs_bf16:
17339; GCN:       ; %bb.0:
17340; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17341; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17342; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17343; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
17344; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17345; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
17346; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17347; GCN-NEXT:    s_setpc_b64 s[30:31]
17348;
17349; GFX7-LABEL: v_fneg_fabs_bf16:
17350; GFX7:       ; %bb.0:
17351; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17352; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17353; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17354; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
17355; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17356; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
17357; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17358; GFX7-NEXT:    s_setpc_b64 s[30:31]
17359;
17360; GFX8-LABEL: v_fneg_fabs_bf16:
17361; GFX8:       ; %bb.0:
17362; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17363; GFX8-NEXT:    v_or_b32_e32 v0, 0x8000, v0
17364; GFX8-NEXT:    s_setpc_b64 s[30:31]
17365;
17366; GFX9-LABEL: v_fneg_fabs_bf16:
17367; GFX9:       ; %bb.0:
17368; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17369; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
17370; GFX9-NEXT:    s_setpc_b64 s[30:31]
17371;
17372; GFX10-LABEL: v_fneg_fabs_bf16:
17373; GFX10:       ; %bb.0:
17374; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17375; GFX10-NEXT:    v_or_b32_e32 v0, 0x8000, v0
17376; GFX10-NEXT:    s_setpc_b64 s[30:31]
17377;
17378; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
17379; GFX11TRUE16:       ; %bb.0:
17380; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17381; GFX11TRUE16-NEXT:    v_or_b16 v0.l, 0x8000, v0.l
17382; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
17383;
17384; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
17385; GFX11FAKE16:       ; %bb.0:
17386; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17387; GFX11FAKE16-NEXT:    v_or_b32_e32 v0, 0x8000, v0
17388; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
17389  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
17390  %op = fneg bfloat %fabs
17391  ret bfloat %op
17392}
17393
; fneg applied to llvm.fabs of an inreg bfloat argument of an amdgpu_ps
; function. The result is bitcast to i16, zero-extended to i32 and passed
; through llvm.amdgcn.readfirstlane before being returned.
; Expected output: GCN and GFX7 do one VALU multiply, read the value back with
; v_readfirstlane_b32 and then clear bit 31 (s_bitset0_b32), XOR with
; 0x80000000 and shift right by 16 on the SALU. GFX8 and newer set bit 15 with
; s_bitset1_b32 and mask with 0xffff.
17394; FIXME: readfirstlane hack for other bugs
17395define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
17396; GCN-LABEL: s_fneg_fabs_bf16:
17397; GCN:       ; %bb.0:
17398; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
17399; GCN-NEXT:    v_readfirstlane_b32 s0, v0
17400; GCN-NEXT:    s_and_b32 s0, s0, 0xffff0000
17401; GCN-NEXT:    s_bitset0_b32 s0, 31
17402; GCN-NEXT:    s_and_b32 s0, s0, 0xffff0000
17403; GCN-NEXT:    s_xor_b32 s0, s0, 0x80000000
17404; GCN-NEXT:    s_lshr_b32 s0, s0, 16
17405; GCN-NEXT:    ; return to shader part epilog
17406;
17407; GFX7-LABEL: s_fneg_fabs_bf16:
17408; GFX7:       ; %bb.0:
17409; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
17410; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
17411; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff0000
17412; GFX7-NEXT:    s_bitset0_b32 s0, 31
17413; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff0000
17414; GFX7-NEXT:    s_xor_b32 s0, s0, 0x80000000
17415; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
17416; GFX7-NEXT:    ; return to shader part epilog
17417;
17418; GFX8-LABEL: s_fneg_fabs_bf16:
17419; GFX8:       ; %bb.0:
17420; GFX8-NEXT:    s_bitset1_b32 s0, 15
17421; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
17422; GFX8-NEXT:    ; return to shader part epilog
17423;
17424; GFX9-LABEL: s_fneg_fabs_bf16:
17425; GFX9:       ; %bb.0:
17426; GFX9-NEXT:    s_bitset1_b32 s0, 15
17427; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
17428; GFX9-NEXT:    ; return to shader part epilog
17429;
17430; GFX10-LABEL: s_fneg_fabs_bf16:
17431; GFX10:       ; %bb.0:
17432; GFX10-NEXT:    s_bitset1_b32 s0, 15
17433; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
17434; GFX10-NEXT:    ; return to shader part epilog
17435;
17436; GFX11-LABEL: s_fneg_fabs_bf16:
17437; GFX11:       ; %bb.0:
17438; GFX11-NEXT:    s_bitset1_b32 s0, 15
17439; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17440; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
17441; GFX11-NEXT:    ; return to shader part epilog
17442  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
17443  %op = fneg bfloat %fabs
17444  %cast = bitcast bfloat %op to i16
17445  %zext = zext i16 %cast to i32
17446  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
17447  ret i32 %readlane
17448}
17449
; llvm.minnum intrinsic declarations for scalar bfloat and for the
; 2-, 3-, 4-, 8-, 16- and 32-element bfloat vector types.
17450declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
17451declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
17452declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
17453declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
17454declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
17455declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
17456declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
17457
; Codegen test for llvm.minnum on a scalar bfloat.
; Expected output, as recorded by the autogenerated checks below:
;   - GCN and GFX7 prefixes: each operand goes through v_mul_f32 by 1.0 and an
;     and with 0xffff0000, then v_min_f32, then the result is masked again.
;   - GFX8 and newer: both operands are shifted left by 16, v_min_f32 is
;     applied, and the f32 result is narrowed with the
;     v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask sequence
;     before the final shift right by 16 (presumably round-to-nearest-even
;     with NaN quieting -- TODO confirm against the lowering code).
; The check lines are regenerated by utils/update_llc_test_checks.py; do not
; edit them by hand.
17458define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
17459; GCN-LABEL: v_minnum_bf16:
17460; GCN:       ; %bb.0:
17461; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17462; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17463; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17464; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17465; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17466; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
17467; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17468; GCN-NEXT:    s_setpc_b64 s[30:31]
17469;
17470; GFX7-LABEL: v_minnum_bf16:
17471; GFX7:       ; %bb.0:
17472; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17473; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17474; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17475; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17476; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17477; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
17478; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17479; GFX7-NEXT:    s_setpc_b64 s[30:31]
17480;
17481; GFX8-LABEL: v_minnum_bf16:
17482; GFX8:       ; %bb.0:
17483; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17484; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17485; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17486; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
17487; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
17488; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
17489; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
17490; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17491; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17492; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
17493; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17494; GFX8-NEXT:    s_setpc_b64 s[30:31]
17495;
17496; GFX9-LABEL: v_minnum_bf16:
17497; GFX9:       ; %bb.0:
17498; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17499; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17500; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17501; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
17502; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
17503; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
17504; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
17505; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17506; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17507; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
17508; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17509; GFX9-NEXT:    s_setpc_b64 s[30:31]
17510;
17511; GFX10-LABEL: v_minnum_bf16:
17512; GFX10:       ; %bb.0:
17513; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17514; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17515; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17516; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
17517; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
17518; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17519; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17520; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
17521; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17522; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17523; GFX10-NEXT:    s_setpc_b64 s[30:31]
17524;
17525; GFX11-LABEL: v_minnum_bf16:
17526; GFX11:       ; %bb.0:
17527; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17528; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17529; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17530; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17531; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
17532; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
17533; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
17534; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17535; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17536; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
17537; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17538; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17539; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17540; GFX11-NEXT:    s_setpc_b64 s[30:31]
17541  %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
17542  ret bfloat %op
17543}
17544
; Codegen test for llvm.minnum on <2 x bfloat>.
; Expected output, as recorded by the autogenerated checks below:
;   - GCN and GFX7 prefixes: operands occupy v0..v3, one register per element;
;     each is multiplied by 1.0, masked with 0xffff0000, and the two
;     v_min_f32 results are masked again.
;   - GFX8 and newer: operands occupy v0 and v1; one element is taken with a
;     shift left by 16 and the other with an and 0xffff0000, each goes through
;     v_min_f32 and the bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask
;     narrowing sequence. GFX8 repacks with v_alignbit_b32, GFX9 and newer
;     with v_perm_b32 using selector 0x7060302.
; The check lines are regenerated by utils/update_llc_test_checks.py; do not
; edit them by hand.
17545define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
17546; GCN-LABEL: v_minnum_v2bf16:
17547; GCN:       ; %bb.0:
17548; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17549; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17550; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17551; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17552; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17553; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17554; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17555; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17556; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17557; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
17558; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
17559; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17560; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17561; GCN-NEXT:    s_setpc_b64 s[30:31]
17562;
17563; GFX7-LABEL: v_minnum_v2bf16:
17564; GFX7:       ; %bb.0:
17565; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17566; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17567; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17568; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17569; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17570; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17571; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17572; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17573; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17574; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
17575; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
17576; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17577; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17578; GFX7-NEXT:    s_setpc_b64 s[30:31]
17579;
17580; GFX8-LABEL: v_minnum_v2bf16:
17581; GFX8:       ; %bb.0:
17582; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17583; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
17584; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
17585; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
17586; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
17587; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
17588; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17589; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17590; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
17591; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
17592; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
17593; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
17594; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
17595; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
17596; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
17597; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
17598; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
17599; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17600; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
17601; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17602; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
17603; GFX8-NEXT:    s_setpc_b64 s[30:31]
17604;
17605; GFX9-LABEL: v_minnum_v2bf16:
17606; GFX9:       ; %bb.0:
17607; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17608; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
17609; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
17610; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
17611; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17612; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17613; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
17614; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
17615; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
17616; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
17617; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
17618; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
17619; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
17620; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
17621; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
17622; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
17623; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17624; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
17625; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
17626; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
17627; GFX9-NEXT:    s_setpc_b64 s[30:31]
17628;
17629; GFX10-LABEL: v_minnum_v2bf16:
17630; GFX10:       ; %bb.0:
17631; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17632; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
17633; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
17634; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17635; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17636; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
17637; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
17638; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
17639; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
17640; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
17641; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
17642; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
17643; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
17644; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
17645; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
17646; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17647; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
17648; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
17649; GFX10-NEXT:    s_setpc_b64 s[30:31]
17650;
17651; GFX11-LABEL: v_minnum_v2bf16:
17652; GFX11:       ; %bb.0:
17653; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17654; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
17655; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17656; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
17657; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17658; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
17659; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
17660; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
17661; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17662; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
17663; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
17664; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
17665; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
17666; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
17667; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
17668; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
17669; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
17670; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
17671; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17672; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
17673; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17674; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
17675; GFX11-NEXT:    s_setpc_b64 s[30:31]
17676  %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
17677  ret <2 x bfloat> %op
17678}
17679
; Codegen test for llvm.minnum on <3 x bfloat>.
; Expected output, as recorded by the autogenerated checks below:
;   - GCN and GFX7 prefixes: operands occupy v0..v5, one register per element,
;     with the same mul-by-1.0 / and 0xffff0000 / v_min_f32 / and pattern for
;     each of the three elements.
;   - GFX8 and newer: %a is read from v0 and v1 and %b from v2 and v3. The
;     v0/v2 pair yields two v_min_f32 results that are repacked into v0; the
;     v1/v3 pair yields a single v_min_f32 result that ends up in v1.
;   - On GFX9 and newer v1 is produced by a trailing v_alignbit_b32 whose
;     first source (s4, s0 or v0 depending on the target) is not written for
;     that purpose anywhere in the function (presumably it fills the unused
;     fourth lane with an undefined value -- TODO confirm).
;   - GFX11 is checked under the separate GFX11TRUE16 and GFX11FAKE16
;     prefixes; the two check bodies differ only in that first source operand
;     of the final v_alignbit_b32 (v0 versus s0).
; The check lines are regenerated by utils/update_llc_test_checks.py; do not
; edit them by hand.
17680define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
17681; GCN-LABEL: v_minnum_v3bf16:
17682; GCN:       ; %bb.0:
17683; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17684; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17685; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17686; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17687; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
17688; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17689; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
17690; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
17691; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17692; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
17693; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17694; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17695; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17696; GCN-NEXT:    v_min_f32_e32 v2, v2, v5
17697; GCN-NEXT:    v_min_f32_e32 v1, v1, v4
17698; GCN-NEXT:    v_min_f32_e32 v0, v0, v3
17699; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17700; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17701; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17702; GCN-NEXT:    s_setpc_b64 s[30:31]
17703;
17704; GFX7-LABEL: v_minnum_v3bf16:
17705; GFX7:       ; %bb.0:
17706; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17707; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17708; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17709; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17710; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
17711; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17712; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
17713; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
17714; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17715; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
17716; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17717; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17718; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17719; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
17720; GFX7-NEXT:    v_min_f32_e32 v1, v1, v4
17721; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
17722; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17723; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17724; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17725; GFX7-NEXT:    s_setpc_b64 s[30:31]
17726;
17727; GFX8-LABEL: v_minnum_v3bf16:
17728; GFX8:       ; %bb.0:
17729; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17730; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
17731; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17732; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
17733; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
17734; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
17735; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
17736; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
17737; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
17738; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
17739; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
17740; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
17741; GFX8-NEXT:    v_min_f32_e32 v3, v4, v3
17742; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
17743; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
17744; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
17745; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17746; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17747; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
17748; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
17749; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
17750; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
17751; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
17752; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
17753; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
17754; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
17755; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
17756; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17757; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
17758; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17759; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
17760; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
17761; GFX8-NEXT:    s_setpc_b64 s[30:31]
17762;
17763; GFX9-LABEL: v_minnum_v3bf16:
17764; GFX9:       ; %bb.0:
17765; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17766; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
17767; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17768; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
17769; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
17770; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
17771; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
17772; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
17773; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
17774; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
17775; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
17776; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
17777; GFX9-NEXT:    v_min_f32_e32 v3, v4, v3
17778; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17779; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17780; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
17781; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
17782; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
17783; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
17784; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
17785; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
17786; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
17787; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
17788; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
17789; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17790; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
17791; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
17792; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
17793; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
17794; GFX9-NEXT:    s_setpc_b64 s[30:31]
17795;
17796; GFX10-LABEL: v_minnum_v3bf16:
17797; GFX10:       ; %bb.0:
17798; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17799; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17800; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
17801; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17802; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17803; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
17804; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
17805; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
17806; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
17807; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
17808; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
17809; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
17810; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
17811; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
17812; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
17813; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
17814; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17815; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
17816; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
17817; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
17818; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17819; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17820; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17821; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
17822; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
17823; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17824; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
17825; GFX10-NEXT:    s_setpc_b64 s[30:31]
17826;
17827; GFX11TRUE16-LABEL: v_minnum_v3bf16:
17828; GFX11TRUE16:       ; %bb.0:
17829; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17830; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17831; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
17832; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
17833; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17834; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17835; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17836; GFX11TRUE16-NEXT:    v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
17837; GFX11TRUE16-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
17838; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
17839; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
17840; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
17841; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
17842; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
17843; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
17844; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
17845; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17846; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
17847; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
17848; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
17849; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17850; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17851; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17852; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17853; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
17854; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
17855; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17856; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17857; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
17858; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
17859;
17860; GFX11FAKE16-LABEL: v_minnum_v3bf16:
17861; GFX11FAKE16:       ; %bb.0:
17862; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17863; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17864; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
17865; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
17866; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17867; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17868; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17869; GFX11FAKE16-NEXT:    v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
17870; GFX11FAKE16-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
17871; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
17872; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
17873; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
17874; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
17875; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
17876; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
17877; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
17878; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17879; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
17880; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
17881; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
17882; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17883; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
17884; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17885; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17886; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
17887; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
17888; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17889; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17890; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
17891; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
17892  %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
17893  ret <3 x bfloat> %op
17894}
17895
; Codegen test for llvm.minnum on <4 x bfloat>.
; Expected output, as recorded by the autogenerated checks below:
;   - GCN and GFX7 prefixes: operands occupy v0..v7, one register per element,
;     with the mul-by-1.0 / and 0xffff0000 / v_min_f32 / and pattern applied
;     to each of the four elements.
;   - GFX8 and newer: %a is read from v0 and v1 and %b from v2 and v3. Each
;     register contributes two v_min_f32 operations (one on the value shifted
;     left by 16, one on the value masked with 0xffff0000), each followed by
;     the bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask narrowing
;     sequence. GFX8 repacks v0 and v1 with v_alignbit_b32; GFX9 and newer
;     use v_perm_b32 with selector 0x7060302.
; The check lines are regenerated by utils/update_llc_test_checks.py; do not
; edit them by hand.
17896define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
17897; GCN-LABEL: v_minnum_v4bf16:
17898; GCN:       ; %bb.0:
17899; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17900; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17901; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
17902; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17903; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
17904; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17905; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
17906; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17907; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
17908; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
17909; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17910; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
17911; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17912; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
17913; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17914; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
17915; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17916; GCN-NEXT:    v_min_f32_e32 v3, v3, v7
17917; GCN-NEXT:    v_min_f32_e32 v2, v2, v6
17918; GCN-NEXT:    v_min_f32_e32 v1, v1, v5
17919; GCN-NEXT:    v_min_f32_e32 v0, v0, v4
17920; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17921; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17922; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17923; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17924; GCN-NEXT:    s_setpc_b64 s[30:31]
17925;
17926; GFX7-LABEL: v_minnum_v4bf16:
17927; GFX7:       ; %bb.0:
17928; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17929; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17930; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
17931; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17932; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
17933; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17934; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
17935; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17936; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
17937; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
17938; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17939; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
17940; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17941; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
17942; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17943; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
17944; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17945; GFX7-NEXT:    v_min_f32_e32 v3, v3, v7
17946; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
17947; GFX7-NEXT:    v_min_f32_e32 v1, v1, v5
17948; GFX7-NEXT:    v_min_f32_e32 v0, v0, v4
17949; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17950; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17951; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17952; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17953; GFX7-NEXT:    s_setpc_b64 s[30:31]
17954;
17955; GFX8-LABEL: v_minnum_v4bf16:
17956; GFX8:       ; %bb.0:
17957; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17958; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
17959; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
17960; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
17961; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
17962; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
17963; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17964; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
17965; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
17966; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
17967; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
17968; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
17969; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
17970; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
17971; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
17972; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
17973; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
17974; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
17975; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
17976; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
17977; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
17978; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
17979; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
17980; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
17981; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
17982; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17983; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
17984; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
17985; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
17986; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
17987; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
17988; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
17989; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
17990; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
17991; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
17992; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
17993; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
17994; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
17995; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
17996; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
17997; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
17998; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
17999; GFX8-NEXT:    s_setpc_b64 s[30:31]
18000;
18001; GFX9-LABEL: v_minnum_v4bf16:
18002; GFX9:       ; %bb.0:
18003; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18004; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
18005; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
18006; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
18007; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18008; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18009; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
18010; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
18011; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
18012; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
18013; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
18014; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
18015; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
18016; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
18017; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
18018; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
18019; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
18020; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
18021; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
18022; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
18023; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
18024; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18025; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18026; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
18027; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
18028; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
18029; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
18030; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
18031; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
18032; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
18033; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
18034; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
18035; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
18036; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
18037; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
18038; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
18039; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
18040; GFX9-NEXT:    s_setpc_b64 s[30:31]
18041;
18042; GFX10-LABEL: v_minnum_v4bf16:
18043; GFX10:       ; %bb.0:
18044; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18045; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
18046; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
18047; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18048; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18049; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
18050; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
18051; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
18052; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18053; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18054; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
18055; GFX10-NEXT:    v_min_f32_e32 v3, v7, v6
18056; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
18057; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
18058; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
18059; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
18060; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
18061; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
18062; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
18063; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
18064; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
18065; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
18066; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
18067; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
18068; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
18069; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
18070; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
18071; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
18072; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
18073; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
18074; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
18075; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
18076; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
18077; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
18078; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
18079; GFX10-NEXT:    s_setpc_b64 s[30:31]
18080;
18081; GFX11-LABEL: v_minnum_v4bf16:
18082; GFX11:       ; %bb.0:
18083; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18084; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
18085; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
18086; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18087; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18088; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
18089; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
18090; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18091; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
18092; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18093; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
18094; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18095; GFX11-NEXT:    v_min_f32_e32 v1, v1, v3
18096; GFX11-NEXT:    v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
18097; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
18098; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
18099; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
18100; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
18101; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
18102; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
18103; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
18104; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
18105; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
18106; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
18107; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
18108; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
18109; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
18110; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
18111; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
18112; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
18113; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
18114; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
18115; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18116; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
18117; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
18118; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
18119; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
18120; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
18121; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
18122; GFX11-NEXT:    s_setpc_b64 s[30:31]
18123  %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
18124  ret <4 x bfloat> %op
18125}
18126
; v_minnum_v8bf16 - element-wise llvm.minnum on two <8 x bfloat> vectors.
; The IR body is a single intrinsic call whose result is returned. Everything
; between the define line and that call is the autogenerated per-target
; expected output (see the NOTE on the first line of this file); regenerate it
; with utils/update_llc_test_checks.py instead of editing it by hand.
18127define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; Default subtarget (RUN line without -mcpu): the expected code uses one VGPR
; per element (v0-v7 for one operand, v8-v15 for the other). Every input is
; multiplied by 1.0 and masked with 0xffff0000, vN is combined with v(N+8) by
; v_min_f32, and each result in v0-v7 is masked with 0xffff0000 again.
18128; GCN-LABEL: v_minnum_v8bf16:
18129; GCN:       ; %bb.0:
18130; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18131; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
18132; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
18133; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
18134; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
18135; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18136; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
18137; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18138; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
18139; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
18140; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
18141; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
18142; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
18143; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
18144; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
18145; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
18146; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
18147; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18148; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18149; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18150; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18151; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18152; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18153; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18154; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18155; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18156; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18157; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18158; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18159; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18160; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18161; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18162; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18163; GCN-NEXT:    v_min_f32_e32 v7, v7, v15
18164; GCN-NEXT:    v_min_f32_e32 v6, v6, v14
18165; GCN-NEXT:    v_min_f32_e32 v5, v5, v13
18166; GCN-NEXT:    v_min_f32_e32 v4, v4, v12
18167; GCN-NEXT:    v_min_f32_e32 v3, v3, v11
18168; GCN-NEXT:    v_min_f32_e32 v2, v2, v10
18169; GCN-NEXT:    v_min_f32_e32 v1, v1, v9
18170; GCN-NEXT:    v_min_f32_e32 v0, v0, v8
18171; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18172; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18173; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18174; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18175; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18176; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18177; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18178; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18179; GCN-NEXT:    s_setpc_b64 s[30:31]
18180;
; hawaii (-mcpu=hawaii RUN line): the expected instruction sequence is the
; same as for the default subtarget above.
18181; GFX7-LABEL: v_minnum_v8bf16:
18182; GFX7:       ; %bb.0:
18183; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18184; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
18185; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
18186; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
18187; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
18188; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18189; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
18190; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18191; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
18192; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
18193; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
18194; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
18195; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
18196; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
18197; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
18198; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
18199; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
18200; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18201; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18202; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18203; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18204; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18205; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18206; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18207; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18208; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18209; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18210; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18211; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18212; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18213; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18214; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18215; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18216; GFX7-NEXT:    v_min_f32_e32 v7, v7, v15
18217; GFX7-NEXT:    v_min_f32_e32 v6, v6, v14
18218; GFX7-NEXT:    v_min_f32_e32 v5, v5, v13
18219; GFX7-NEXT:    v_min_f32_e32 v4, v4, v12
18220; GFX7-NEXT:    v_min_f32_e32 v3, v3, v11
18221; GFX7-NEXT:    v_min_f32_e32 v2, v2, v10
18222; GFX7-NEXT:    v_min_f32_e32 v1, v1, v9
18223; GFX7-NEXT:    v_min_f32_e32 v0, v0, v8
18224; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18225; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18226; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18227; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18228; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18229; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18230; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18231; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18232; GFX7-NEXT:    s_setpc_b64 s[30:31]
18233;
; tonga (-mcpu=tonga RUN line): two elements share a register here (v0-v3 and
; v4-v7). Per register pair the low halves are shifted left by 16 and the high
; halves are isolated with the 0xffff0000 mask before v_min_f32. Each f32
; result is then narrowed: bit 16 (v_bfe_u32) and 0x7fff are added, and
; v_cndmask picks the value OR'ed with 0x400000 instead when v_cmp_u of the
; result against itself sets vcc (presumably the NaN case - this looks like
; round-to-nearest-even with NaN quieting; confirm against the lowering code).
; The halves are repacked with v_lshrrev_b32 plus v_alignbit_b32.
18234; GFX8-LABEL: v_minnum_v8bf16:
18235; GFX8:       ; %bb.0:
18236; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18237; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
18238; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
18239; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
18240; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
18241; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
18242; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18243; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18244; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
18245; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
18246; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
18247; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
18248; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
18249; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
18250; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
18251; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
18252; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
18253; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
18254; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
18255; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
18256; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
18257; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
18258; GFX8-NEXT:    v_min_f32_e32 v7, v9, v7
18259; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
18260; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
18261; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18262; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18263; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
18264; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
18265; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
18266; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
18267; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
18268; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
18269; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
18270; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
18271; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
18272; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
18273; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
18274; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
18275; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
18276; GFX8-NEXT:    v_min_f32_e32 v6, v9, v6
18277; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
18278; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
18279; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18280; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18281; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
18282; GFX8-NEXT:    v_min_f32_e32 v1, v1, v5
18283; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18284; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18285; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
18286; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18287; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
18288; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
18289; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
18290; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
18291; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
18292; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
18293; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
18294; GFX8-NEXT:    v_min_f32_e32 v5, v9, v5
18295; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
18296; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
18297; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18298; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18299; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
18300; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
18301; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18302; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18303; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
18304; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18305; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
18306; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
18307; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
18308; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
18309; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
18310; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
18311; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
18312; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
18313; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
18314; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
18315; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
18316; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
18317; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
18318; GFX8-NEXT:    s_setpc_b64 s[30:31]
18319;
; gfx900 (-mcpu=gfx900 RUN line): same per-element steps as the tonga section,
; but the two adds are a single v_add3_u32 with 0x7fff held in s4, and the
; halves are repacked with v_perm_b32 after s4 is reloaded with the selector
; 0x7060302.
18320; GFX9-LABEL: v_minnum_v8bf16:
18321; GFX9:       ; %bb.0:
18322; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18323; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
18324; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
18325; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
18326; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18327; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18328; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
18329; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
18330; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
18331; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
18332; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
18333; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
18334; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
18335; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
18336; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
18337; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
18338; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
18339; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
18340; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
18341; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
18342; GFX9-NEXT:    v_min_f32_e32 v7, v9, v7
18343; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18344; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18345; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
18346; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
18347; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
18348; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
18349; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
18350; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
18351; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
18352; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
18353; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
18354; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
18355; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
18356; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
18357; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
18358; GFX9-NEXT:    v_min_f32_e32 v6, v9, v6
18359; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18360; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18361; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
18362; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
18363; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
18364; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18365; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18366; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
18367; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18368; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
18369; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
18370; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
18371; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
18372; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
18373; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
18374; GFX9-NEXT:    v_min_f32_e32 v5, v9, v5
18375; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18376; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18377; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
18378; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
18379; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
18380; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18381; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18382; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
18383; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18384; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
18385; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
18386; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
18387; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
18388; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
18389; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
18390; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
18391; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
18392; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
18393; GFX9-NEXT:    s_setpc_b64 s[30:31]
18394;
; gfx1010 (-mcpu=gfx1010 RUN line): same operations as the gfx900 section with
; 0x7fff and 0x7060302 written as literal operands, vcc_lo as the condition
; register, and the per-element chains interleaved rather than sequential.
18395; GFX10-LABEL: v_minnum_v8bf16:
18396; GFX10:       ; %bb.0:
18397; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18398; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
18399; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
18400; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18401; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18402; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
18403; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18404; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
18405; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
18406; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18407; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
18408; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
18409; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
18410; GFX10-NEXT:    v_min_f32_e32 v7, v10, v9
18411; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
18412; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
18413; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
18414; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
18415; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
18416; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
18417; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
18418; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
18419; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
18420; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
18421; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
18422; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
18423; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
18424; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
18425; GFX10-NEXT:    v_min_f32_e32 v6, v10, v6
18426; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
18427; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18428; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18429; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
18430; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
18431; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
18432; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
18433; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18434; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18435; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
18436; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
18437; GFX10-NEXT:    v_min_f32_e32 v5, v15, v13
18438; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
18439; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
18440; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
18441; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
18442; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18443; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
18444; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
18445; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
18446; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
18447; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
18448; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
18449; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
18450; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
18451; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
18452; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
18453; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
18454; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
18455; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
18456; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
18457; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
18458; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
18459; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
18460; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
18461; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
18462; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
18463; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
18464; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
18465; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
18466; GFX10-NEXT:    s_setpc_b64 s[30:31]
18467;
; gfx1100 (both -mcpu=gfx1100 RUN lines share this prefix, so the expectations
; apply with real-true16 enabled and disabled): same operations as the gfx1010
; section, with s_delay_alu hints inserted and some instruction pairs emitted
; as v_dual_* forms.
18468; GFX11-LABEL: v_minnum_v8bf16:
18469; GFX11:       ; %bb.0:
18470; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18471; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
18472; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
18473; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18474; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
18475; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
18476; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18477; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
18478; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
18479; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
18480; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18481; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
18482; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18483; GFX11-NEXT:    v_min_f32_e32 v3, v3, v7
18484; GFX11-NEXT:    v_min_f32_e32 v7, v10, v9
18485; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
18486; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
18487; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18488; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
18489; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
18490; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
18491; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
18492; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
18493; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
18494; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
18495; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
18496; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
18497; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18498; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
18499; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18500; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
18501; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
18502; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
18503; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18504; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
18505; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18506; GFX11-NEXT:    v_min_f32_e32 v6, v10, v6
18507; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
18508; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
18509; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
18510; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
18511; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
18512; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
18513; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18514; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
18515; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
18516; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18517; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18518; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
18519; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
18520; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
18521; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
18522; GFX11-NEXT:    v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
18523; GFX11-NEXT:    v_min_f32_e32 v5, v15, v13
18524; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18525; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
18526; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
18527; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
18528; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
18529; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18530; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
18531; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
18532; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
18533; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
18534; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
18535; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
18536; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
18537; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
18538; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
18539; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
18540; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
18541; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
18542; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
18543; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
18544; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
18545; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
18546; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
18547; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
18548; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
18549; GFX11-NEXT:    s_setpc_b64 s[30:31]
; The IR under test: one vector minnum intrinsic call, result returned as-is.
18550  %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
18551  ret <8 x bfloat> %op
18552}
18553
18554define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
18555; GCN-LABEL: v_minnum_v16bf16:
18556; GCN:       ; %bb.0:
18557; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18558; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
18559; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
18560; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
18561; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18562; GCN-NEXT:    v_min_f32_e32 v14, v14, v30
18563; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
18564; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
18565; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
18566; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18567; GCN-NEXT:    v_min_f32_e32 v13, v13, v29
18568; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
18569; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
18570; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
18571; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18572; GCN-NEXT:    v_min_f32_e32 v12, v12, v28
18573; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
18574; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
18575; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
18576; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
18577; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
18578; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
18579; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
18580; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
18581; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
18582; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
18583; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
18584; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
18585; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
18586; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
18587; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
18588; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
18589; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18590; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
18591; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18592; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
18593; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
18594; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
18595; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
18596; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
18597; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
18598; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
18599; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18600; GCN-NEXT:    v_min_f32_e32 v11, v11, v27
18601; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32
18602; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
18603; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18604; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
18605; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18606; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
18607; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18608; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
18609; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18610; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
18611; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18612; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
18613; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18614; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
18615; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18616; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
18617; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18618; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
18619; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18620; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
18621; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18622; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
18623; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18624; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18625; GCN-NEXT:    v_min_f32_e32 v10, v10, v26
18626; GCN-NEXT:    v_min_f32_e32 v9, v9, v25
18627; GCN-NEXT:    v_min_f32_e32 v8, v8, v24
18628; GCN-NEXT:    v_min_f32_e32 v7, v7, v23
18629; GCN-NEXT:    v_min_f32_e32 v6, v6, v22
18630; GCN-NEXT:    v_min_f32_e32 v5, v5, v21
18631; GCN-NEXT:    v_min_f32_e32 v4, v4, v20
18632; GCN-NEXT:    v_min_f32_e32 v3, v3, v19
18633; GCN-NEXT:    v_min_f32_e32 v2, v2, v18
18634; GCN-NEXT:    v_min_f32_e32 v1, v1, v17
18635; GCN-NEXT:    v_min_f32_e32 v0, v0, v16
18636; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18637; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18638; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18639; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18640; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18641; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18642; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18643; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18644; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18645; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18646; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18647; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18648; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18649; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18650; GCN-NEXT:    s_waitcnt vmcnt(0)
18651; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v27
18652; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
18653; GCN-NEXT:    v_min_f32_e32 v15, v15, v16
18654; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18655; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18656; GCN-NEXT:    s_setpc_b64 s[30:31]
18657;
18658; GFX7-LABEL: v_minnum_v16bf16:
18659; GFX7:       ; %bb.0:
18660; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18661; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
18662; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
18663; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
18664; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18665; GFX7-NEXT:    v_min_f32_e32 v11, v11, v27
18666; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32
18667; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
18668; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
18669; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
18670; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18671; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
18672; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
18673; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
18674; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
18675; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
18676; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
18677; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
18678; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
18679; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
18680; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
18681; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
18682; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
18683; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
18684; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
18685; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
18686; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
18687; GFX7-NEXT:    v_min_f32_e32 v6, v6, v22
18688; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
18689; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
18690; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
18691; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18692; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
18693; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18694; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
18695; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
18696; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
18697; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
18698; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
18699; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
18700; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18701; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
18702; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18703; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
18704; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18705; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
18706; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18707; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
18708; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18709; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
18710; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18711; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
18712; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18713; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18714; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
18715; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18716; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
18717; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18718; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
18719; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18720; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
18721; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18722; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
18723; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18724; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
18725; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18726; GFX7-NEXT:    v_min_f32_e32 v14, v14, v30
18727; GFX7-NEXT:    v_min_f32_e32 v13, v13, v29
18728; GFX7-NEXT:    v_min_f32_e32 v12, v12, v28
18729; GFX7-NEXT:    v_min_f32_e32 v10, v10, v26
18730; GFX7-NEXT:    v_min_f32_e32 v9, v9, v25
18731; GFX7-NEXT:    v_min_f32_e32 v8, v8, v24
18732; GFX7-NEXT:    v_min_f32_e32 v7, v7, v23
18733; GFX7-NEXT:    v_min_f32_e32 v5, v5, v21
18734; GFX7-NEXT:    v_min_f32_e32 v4, v4, v20
18735; GFX7-NEXT:    v_min_f32_e32 v3, v3, v19
18736; GFX7-NEXT:    v_min_f32_e32 v2, v2, v18
18737; GFX7-NEXT:    v_min_f32_e32 v1, v1, v17
18738; GFX7-NEXT:    v_min_f32_e32 v0, v0, v16
18739; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18740; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18741; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18742; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18743; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18744; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18745; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18746; GFX7-NEXT:    s_waitcnt vmcnt(0)
18747; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v27
18748; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
18749; GFX7-NEXT:    v_min_f32_e32 v15, v15, v22
18750; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18751; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18752; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18753; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18754; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18755; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18756; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18757; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18758; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18759; GFX7-NEXT:    s_setpc_b64 s[30:31]
18760;
18761; GFX8-LABEL: v_minnum_v16bf16:
18762; GFX8:       ; %bb.0:
18763; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18764; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
18765; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
18766; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
18767; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
18768; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
18769; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
18770; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18771; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18772; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18773; GFX8-NEXT:    v_min_f32_e32 v7, v7, v15
18774; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
18775; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
18776; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
18777; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
18778; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
18779; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
18780; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
18781; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
18782; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
18783; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
18784; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
18785; GFX8-NEXT:    v_min_f32_e32 v15, v17, v15
18786; GFX8-NEXT:    v_bfe_u32 v17, v15, 16, 1
18787; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v15
18788; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18789; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18790; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18791; GFX8-NEXT:    v_min_f32_e32 v6, v6, v14
18792; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
18793; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
18794; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
18795; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
18796; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
18797; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
18798; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
18799; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18800; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
18801; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
18802; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
18803; GFX8-NEXT:    v_min_f32_e32 v14, v17, v14
18804; GFX8-NEXT:    v_bfe_u32 v17, v14, 16, 1
18805; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v14
18806; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18807; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18808; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18809; GFX8-NEXT:    v_min_f32_e32 v5, v5, v13
18810; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
18811; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
18812; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
18813; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
18814; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
18815; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
18816; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
18817; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18818; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
18819; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
18820; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
18821; GFX8-NEXT:    v_min_f32_e32 v13, v17, v13
18822; GFX8-NEXT:    v_bfe_u32 v17, v13, 16, 1
18823; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v13
18824; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18825; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18826; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18827; GFX8-NEXT:    v_min_f32_e32 v4, v4, v12
18828; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
18829; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
18830; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
18831; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
18832; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
18833; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
18834; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
18835; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
18836; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
18837; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
18838; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
18839; GFX8-NEXT:    v_min_f32_e32 v12, v17, v12
18840; GFX8-NEXT:    v_bfe_u32 v17, v12, 16, 1
18841; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v12
18842; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18843; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18844; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18845; GFX8-NEXT:    v_min_f32_e32 v3, v3, v11
18846; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
18847; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
18848; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
18849; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
18850; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
18851; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
18852; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
18853; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
18854; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
18855; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
18856; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
18857; GFX8-NEXT:    v_min_f32_e32 v11, v17, v11
18858; GFX8-NEXT:    v_bfe_u32 v17, v11, 16, 1
18859; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v11
18860; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
18861; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18862; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18863; GFX8-NEXT:    v_min_f32_e32 v2, v2, v10
18864; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
18865; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
18866; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
18867; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
18868; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
18869; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
18870; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
18871; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
18872; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
18873; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
18874; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
18875; GFX8-NEXT:    v_min_f32_e32 v10, v17, v10
18876; GFX8-NEXT:    v_bfe_u32 v17, v10, 16, 1
18877; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v10
18878; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
18879; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
18880; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18881; GFX8-NEXT:    v_min_f32_e32 v1, v1, v9
18882; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
18883; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
18884; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
18885; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
18886; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
18887; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
18888; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
18889; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
18890; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
18891; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
18892; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
18893; GFX8-NEXT:    v_min_f32_e32 v9, v17, v9
18894; GFX8-NEXT:    v_bfe_u32 v17, v9, 16, 1
18895; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v9
18896; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
18897; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
18898; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
18899; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
18900; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
18901; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
18902; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
18903; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
18904; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
18905; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
18906; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
18907; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
18908; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
18909; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
18910; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
18911; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
18912; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
18913; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
18914; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
18915; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
18916; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
18917; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
18918; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
18919; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
18920; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
18921; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
18922; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
18923; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
18924; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
18925; GFX8-NEXT:    s_setpc_b64 s[30:31]
18926;
18927; GFX9-LABEL: v_minnum_v16bf16:
18928; GFX9:       ; %bb.0:
18929; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18930; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
18931; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
18932; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
18933; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
18934; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
18935; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
18936; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
18937; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
18938; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
18939; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
18940; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
18941; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
18942; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
18943; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
18944; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
18945; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
18946; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
18947; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
18948; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
18949; GFX9-NEXT:    v_min_f32_e32 v15, v17, v15
18950; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
18951; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
18952; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
18953; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
18954; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
18955; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
18956; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
18957; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
18958; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
18959; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
18960; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
18961; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18962; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
18963; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
18964; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
18965; GFX9-NEXT:    v_min_f32_e32 v14, v17, v14
18966; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
18967; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
18968; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
18969; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
18970; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
18971; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
18972; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
18973; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
18974; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
18975; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
18976; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
18977; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18978; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
18979; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
18980; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
18981; GFX9-NEXT:    v_min_f32_e32 v13, v17, v13
18982; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
18983; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
18984; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
18985; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
18986; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
18987; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
18988; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
18989; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
18990; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
18991; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
18992; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
18993; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
18994; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
18995; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
18996; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
18997; GFX9-NEXT:    v_min_f32_e32 v12, v17, v12
18998; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
18999; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19000; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
19001; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
19002; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
19003; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
19004; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
19005; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
19006; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
19007; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
19008; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
19009; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
19010; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
19011; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
19012; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
19013; GFX9-NEXT:    v_min_f32_e32 v11, v17, v11
19014; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19015; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19016; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
19017; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
19018; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
19019; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
19020; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
19021; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
19022; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
19023; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
19024; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
19025; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
19026; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
19027; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
19028; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
19029; GFX9-NEXT:    v_min_f32_e32 v10, v17, v10
19030; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19031; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
19032; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
19033; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
19034; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
19035; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
19036; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
19037; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
19038; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
19039; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
19040; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
19041; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
19042; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
19043; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
19044; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
19045; GFX9-NEXT:    v_min_f32_e32 v9, v17, v9
19046; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19047; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19048; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
19049; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
19050; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
19051; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
19052; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
19053; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
19054; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
19055; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
19056; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
19057; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
19058; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
19059; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
19060; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
19061; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
19062; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
19063; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
19064; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
19065; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
19066; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
19067; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
19068; GFX9-NEXT:    s_setpc_b64 s[30:31]
19069;
19070; GFX10-LABEL: v_minnum_v16bf16:
19071; GFX10:       ; %bb.0:
19072; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19073; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
19074; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
19075; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
19076; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
19077; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
19078; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
19079; GFX10-NEXT:    v_min_f32_e32 v16, v17, v16
19080; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
19081; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
19082; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19083; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
19084; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
19085; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
19086; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
19087; GFX10-NEXT:    v_min_f32_e32 v17, v18, v17
19088; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
19089; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
19090; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
19091; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
19092; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
19093; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
19094; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
19095; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
19096; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
19097; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
19098; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
19099; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
19100; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
19101; GFX10-NEXT:    v_bfe_u32 v18, v6, 16, 1
19102; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19103; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
19104; GFX10-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
19105; GFX10-NEXT:    v_min_f32_e32 v17, v20, v19
19106; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
19107; GFX10-NEXT:    v_min_f32_e32 v5, v5, v13
19108; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
19109; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
19110; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
19111; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
19112; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
19113; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
19114; GFX10-NEXT:    v_bfe_u32 v21, v5, 16, 1
19115; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19116; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
19117; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
19118; GFX10-NEXT:    v_min_f32_e32 v13, v19, v18
19119; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
19120; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
19121; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
19122; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
19123; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
19124; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
19125; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
19126; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
19127; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
19128; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
19129; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
19130; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
19131; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19132; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
19133; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
19134; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19135; GFX10-NEXT:    v_min_f32_e32 v12, v18, v12
19136; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
19137; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
19138; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
19139; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
19140; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
19141; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
19142; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
19143; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
19144; GFX10-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
19145; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
19146; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19147; GFX10-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
19148; GFX10-NEXT:    v_min_f32_e32 v18, v19, v18
19149; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19150; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
19151; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
19152; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
19153; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
19154; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
19155; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
19156; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
19157; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
19158; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
19159; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
19160; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
19161; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
19162; GFX10-NEXT:    v_bfe_u32 v19, v2, 16, 1
19163; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
19164; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
19165; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19166; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
19167; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
19168; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
19169; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
19170; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
19171; GFX10-NEXT:    v_min_f32_e32 v19, v22, v20
19172; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
19173; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
19174; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19175; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19176; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
19177; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
19178; GFX10-NEXT:    v_min_f32_e32 v9, v22, v20
19179; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
19180; GFX10-NEXT:    v_min_f32_e32 v0, v0, v8
19181; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
19182; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
19183; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
19184; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
19185; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
19186; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
19187; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
19188; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
19189; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
19190; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
19191; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
19192; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
19193; GFX10-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
19194; GFX10-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
19195; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
19196; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
19197; GFX10-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
19198; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
19199; GFX10-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
19200; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
19201; GFX10-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
19202; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
19203; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
19204; GFX10-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
19205; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
19206; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
19207; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
19208; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
19209; GFX10-NEXT:    s_setpc_b64 s[30:31]
19210;
19211; GFX11-LABEL: v_minnum_v16bf16:
19212; GFX11:       ; %bb.0:
19213; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19214; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
19215; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
19216; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
19217; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
19218; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
19219; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
19220; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
19221; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19222; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
19223; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
19224; GFX11-NEXT:    v_min_f32_e32 v17, v18, v17
19225; GFX11-NEXT:    v_min_f32_e32 v6, v6, v14
19226; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
19227; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
19228; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
19229; GFX11-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
19230; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
19231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
19232; GFX11-NEXT:    v_min_f32_e32 v7, v7, v15
19233; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
19234; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
19235; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
19236; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
19237; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
19238; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
19239; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
19240; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
19241; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
19242; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
19243; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
19244; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
19245; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
19246; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
19247; GFX11-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
19248; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
19249; GFX11-NEXT:    v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
19250; GFX11-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
19251; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
19252; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
19253; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19254; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
19255; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19256; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
19257; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
19258; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
19259; GFX11-NEXT:    v_min_f32_e32 v4, v4, v12
19260; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
19261; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
19262; GFX11-NEXT:    v_min_f32_e32 v5, v5, v13
19263; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
19264; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
19265; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
19266; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
19267; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
19268; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
19269; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
19270; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
19271; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
19272; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
19273; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
19274; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
19275; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
19276; GFX11-NEXT:    v_min_f32_e32 v12, v18, v12
19277; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
19278; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
19279; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
19280; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
19281; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
19282; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19283; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
19284; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
19285; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
19286; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
19287; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
19288; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
19289; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
19290; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
19291; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
19292; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
19293; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
19294; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19295; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19296; GFX11-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
19297; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
19298; GFX11-NEXT:    v_min_f32_e32 v18, v19, v18
19299; GFX11-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
19300; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
19301; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19302; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19303; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
19304; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
19305; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19306; GFX11-NEXT:    v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
19307; GFX11-NEXT:    v_min_f32_e32 v3, v3, v11
19308; GFX11-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
19309; GFX11-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
19310; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
19311; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
19312; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
19313; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
19314; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
19315; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
19316; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
19317; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
19318; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
19319; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
19320; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
19321; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
19322; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
19323; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
19324; GFX11-NEXT:    v_min_f32_e32 v19, v22, v20
19325; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
19326; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
19327; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19328; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
19329; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
19330; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19331; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19332; GFX11-NEXT:    v_bfe_u32 v23, v19, 16, 1
19333; GFX11-NEXT:    v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9
19334; GFX11-NEXT:    v_min_f32_e32 v9, v22, v20
19335; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
19336; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
19337; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
19338; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
19339; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
19340; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
19341; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
19342; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
19343; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
19344; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
19345; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
19346; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
19347; GFX11-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
19348; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
19349; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
19350; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
19351; GFX11-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
19352; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
19353; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
19354; GFX11-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
19355; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
19356; GFX11-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
19357; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
19358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
19359; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
19360; GFX11-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
19361; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
19362; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
19363; GFX11-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
19364; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
19365; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
19366; GFX11-NEXT:    s_setpc_b64 s[30:31]
19367  %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
19368  ret <16 x bfloat> %op
19369}
19370
19371define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
19372; GCN-LABEL: v_minnum_v32bf16:
19373; GCN:       ; %bb.0:
19374; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19375; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
19376; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
19377; GCN-NEXT:    s_waitcnt vmcnt(1)
19378; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
19379; GCN-NEXT:    s_waitcnt vmcnt(0)
19380; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19381; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19382; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
19383; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
19384; GCN-NEXT:    v_min_f32_e32 v31, v31, v32
19385; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
19386; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
19387; GCN-NEXT:    s_waitcnt vmcnt(0)
19388; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19389; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19390; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
19391; GCN-NEXT:    v_min_f32_e32 v30, v30, v32
19392; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
19393; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
19394; GCN-NEXT:    s_waitcnt vmcnt(0)
19395; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19396; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19397; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
19398; GCN-NEXT:    v_min_f32_e32 v29, v29, v32
19399; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
19400; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
19401; GCN-NEXT:    s_waitcnt vmcnt(0)
19402; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19403; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19404; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
19405; GCN-NEXT:    v_min_f32_e32 v28, v28, v32
19406; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
19407; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
19408; GCN-NEXT:    s_waitcnt vmcnt(0)
19409; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19410; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19411; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
19412; GCN-NEXT:    v_min_f32_e32 v27, v27, v32
19413; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
19414; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
19415; GCN-NEXT:    s_waitcnt vmcnt(0)
19416; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19417; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19418; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
19419; GCN-NEXT:    v_min_f32_e32 v26, v26, v32
19420; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
19421; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
19422; GCN-NEXT:    s_waitcnt vmcnt(0)
19423; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19424; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19425; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
19426; GCN-NEXT:    v_min_f32_e32 v25, v25, v32
19427; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
19428; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
19429; GCN-NEXT:    s_waitcnt vmcnt(0)
19430; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19431; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19432; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
19433; GCN-NEXT:    v_min_f32_e32 v24, v24, v32
19434; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
19435; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
19436; GCN-NEXT:    s_waitcnt vmcnt(0)
19437; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19438; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19439; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
19440; GCN-NEXT:    v_min_f32_e32 v23, v23, v32
19441; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
19442; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
19443; GCN-NEXT:    s_waitcnt vmcnt(0)
19444; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19445; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19446; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
19447; GCN-NEXT:    v_min_f32_e32 v22, v22, v32
19448; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
19449; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
19450; GCN-NEXT:    s_waitcnt vmcnt(0)
19451; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19452; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19453; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
19454; GCN-NEXT:    v_min_f32_e32 v21, v21, v32
19455; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
19456; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
19457; GCN-NEXT:    s_waitcnt vmcnt(0)
19458; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19459; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19460; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
19461; GCN-NEXT:    v_min_f32_e32 v20, v20, v32
19462; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
19463; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
19464; GCN-NEXT:    s_waitcnt vmcnt(0)
19465; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19466; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19467; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
19468; GCN-NEXT:    v_min_f32_e32 v19, v19, v32
19469; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
19470; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
19471; GCN-NEXT:    s_waitcnt vmcnt(0)
19472; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19473; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19474; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
19475; GCN-NEXT:    v_min_f32_e32 v18, v18, v32
19476; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
19477; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
19478; GCN-NEXT:    s_waitcnt vmcnt(0)
19479; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19480; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19481; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
19482; GCN-NEXT:    v_min_f32_e32 v17, v17, v32
19483; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
19484; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
19485; GCN-NEXT:    s_waitcnt vmcnt(0)
19486; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19487; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19488; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
19489; GCN-NEXT:    v_min_f32_e32 v16, v16, v32
19490; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
19491; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
19492; GCN-NEXT:    s_waitcnt vmcnt(0)
19493; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19494; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19495; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
19496; GCN-NEXT:    v_min_f32_e32 v15, v15, v32
19497; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
19498; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19499; GCN-NEXT:    s_waitcnt vmcnt(0)
19500; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19501; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19502; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
19503; GCN-NEXT:    v_min_f32_e32 v14, v14, v32
19504; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
19505; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19506; GCN-NEXT:    s_waitcnt vmcnt(0)
19507; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19508; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19509; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
19510; GCN-NEXT:    v_min_f32_e32 v13, v13, v32
19511; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
19512; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19513; GCN-NEXT:    s_waitcnt vmcnt(0)
19514; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19515; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19516; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
19517; GCN-NEXT:    v_min_f32_e32 v12, v12, v32
19518; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
19519; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19520; GCN-NEXT:    s_waitcnt vmcnt(0)
19521; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19522; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19523; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
19524; GCN-NEXT:    v_min_f32_e32 v11, v11, v32
19525; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
19526; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19527; GCN-NEXT:    s_waitcnt vmcnt(0)
19528; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19529; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19530; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
19531; GCN-NEXT:    v_min_f32_e32 v10, v10, v32
19532; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
19533; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19534; GCN-NEXT:    s_waitcnt vmcnt(0)
19535; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19536; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19537; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
19538; GCN-NEXT:    v_min_f32_e32 v9, v9, v32
19539; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
19540; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19541; GCN-NEXT:    s_waitcnt vmcnt(0)
19542; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19543; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19544; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
19545; GCN-NEXT:    v_min_f32_e32 v8, v8, v32
19546; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
19547; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
19548; GCN-NEXT:    s_waitcnt vmcnt(0)
19549; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19550; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19551; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
19552; GCN-NEXT:    v_min_f32_e32 v7, v7, v32
19553; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
19554; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
19555; GCN-NEXT:    s_waitcnt vmcnt(0)
19556; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19557; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19558; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
19559; GCN-NEXT:    v_min_f32_e32 v6, v6, v32
19560; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
19561; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
19562; GCN-NEXT:    s_waitcnt vmcnt(0)
19563; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19564; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19565; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
19566; GCN-NEXT:    v_min_f32_e32 v5, v5, v32
19567; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
19568; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
19569; GCN-NEXT:    s_waitcnt vmcnt(0)
19570; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19571; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19572; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
19573; GCN-NEXT:    v_min_f32_e32 v4, v4, v32
19574; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
19575; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19576; GCN-NEXT:    s_waitcnt vmcnt(0)
19577; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19578; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19579; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
19580; GCN-NEXT:    v_min_f32_e32 v3, v3, v32
19581; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
19582; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19583; GCN-NEXT:    s_waitcnt vmcnt(0)
19584; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19585; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19586; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
19587; GCN-NEXT:    v_min_f32_e32 v2, v2, v32
19588; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
19589; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
19590; GCN-NEXT:    s_waitcnt vmcnt(0)
19591; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19592; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19593; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
19594; GCN-NEXT:    v_min_f32_e32 v1, v1, v32
19595; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
19596; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19597; GCN-NEXT:    s_waitcnt vmcnt(0)
19598; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
19599; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19600; GCN-NEXT:    v_min_f32_e32 v0, v0, v32
19601; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19602; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
19603; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19604; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19605; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
19606; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
19607; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
19608; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
19609; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19610; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19611; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19612; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19613; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19614; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19615; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19616; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
19617; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
19618; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
19619; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
19620; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
19621; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
19622; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
19623; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
19624; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
19625; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
19626; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
19627; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
19628; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
19629; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
19630; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
19631; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
19632; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
19633; GCN-NEXT:    s_setpc_b64 s[30:31]
19634;
19635; GFX7-LABEL: v_minnum_v32bf16:
19636; GFX7:       ; %bb.0:
19637; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19638; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
19639; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
19640; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
19641; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
19642; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
19643; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
19644; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
19645; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
19646; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
19647; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
19648; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
19649; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
19650; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
19651; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
19652; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
19653; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
19654; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
19655; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
19656; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
19657; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
19658; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
19659; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
19660; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
19661; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
19662; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
19663; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
19664; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
19665; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
19666; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
19667; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
19668; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
19669; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
19670; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
19671; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
19672; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
19673; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19674; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
19675; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19676; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
19677; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19678; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
19679; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19680; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
19681; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19682; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
19683; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19684; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
19685; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19686; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
19687; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
19688; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
19689; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
19690; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
19691; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
19692; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
19693; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
19694; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
19695; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19696; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
19697; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19698; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
19699; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
19700; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
19701; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19702; GFX7-NEXT:    s_waitcnt vmcnt(1)
19703; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
19704; GFX7-NEXT:    s_waitcnt vmcnt(0)
19705; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19706; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19707; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
19708; GFX7-NEXT:    v_min_f32_e32 v31, v31, v32
19709; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
19710; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
19711; GFX7-NEXT:    s_waitcnt vmcnt(0)
19712; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19713; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19714; GFX7-NEXT:    v_min_f32_e32 v30, v30, v32
19715; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
19716; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
19717; GFX7-NEXT:    s_waitcnt vmcnt(0)
19718; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19719; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19720; GFX7-NEXT:    v_min_f32_e32 v29, v29, v32
19721; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
19722; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
19723; GFX7-NEXT:    s_waitcnt vmcnt(0)
19724; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19725; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19726; GFX7-NEXT:    v_min_f32_e32 v28, v28, v32
19727; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
19728; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
19729; GFX7-NEXT:    s_waitcnt vmcnt(0)
19730; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19731; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19732; GFX7-NEXT:    v_min_f32_e32 v27, v27, v32
19733; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
19734; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
19735; GFX7-NEXT:    s_waitcnt vmcnt(0)
19736; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19737; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19738; GFX7-NEXT:    v_min_f32_e32 v26, v26, v32
19739; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
19740; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
19741; GFX7-NEXT:    s_waitcnt vmcnt(0)
19742; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19743; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19744; GFX7-NEXT:    v_min_f32_e32 v25, v25, v32
19745; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
19746; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
19747; GFX7-NEXT:    s_waitcnt vmcnt(0)
19748; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19749; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19750; GFX7-NEXT:    v_min_f32_e32 v24, v24, v32
19751; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
19752; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
19753; GFX7-NEXT:    s_waitcnt vmcnt(0)
19754; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19755; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19756; GFX7-NEXT:    v_min_f32_e32 v23, v23, v32
19757; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
19758; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
19759; GFX7-NEXT:    s_waitcnt vmcnt(0)
19760; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19761; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19762; GFX7-NEXT:    v_min_f32_e32 v22, v22, v32
19763; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
19764; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
19765; GFX7-NEXT:    s_waitcnt vmcnt(0)
19766; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19767; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19768; GFX7-NEXT:    v_min_f32_e32 v21, v21, v32
19769; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
19770; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
19771; GFX7-NEXT:    s_waitcnt vmcnt(0)
19772; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19773; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19774; GFX7-NEXT:    v_min_f32_e32 v20, v20, v32
19775; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
19776; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
19777; GFX7-NEXT:    s_waitcnt vmcnt(0)
19778; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19779; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19780; GFX7-NEXT:    v_min_f32_e32 v19, v19, v32
19781; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
19782; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
19783; GFX7-NEXT:    s_waitcnt vmcnt(0)
19784; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19785; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19786; GFX7-NEXT:    v_min_f32_e32 v18, v18, v32
19787; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
19788; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
19789; GFX7-NEXT:    s_waitcnt vmcnt(0)
19790; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19791; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19792; GFX7-NEXT:    v_min_f32_e32 v17, v17, v32
19793; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
19794; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
19795; GFX7-NEXT:    s_waitcnt vmcnt(0)
19796; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19797; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19798; GFX7-NEXT:    v_min_f32_e32 v16, v16, v32
19799; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
19800; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
19801; GFX7-NEXT:    s_waitcnt vmcnt(0)
19802; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19803; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19804; GFX7-NEXT:    v_min_f32_e32 v15, v15, v32
19805; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
19806; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
19807; GFX7-NEXT:    s_waitcnt vmcnt(0)
19808; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19809; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19810; GFX7-NEXT:    v_min_f32_e32 v14, v14, v32
19811; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
19812; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19813; GFX7-NEXT:    s_waitcnt vmcnt(0)
19814; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19815; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19816; GFX7-NEXT:    v_min_f32_e32 v13, v13, v32
19817; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
19818; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19819; GFX7-NEXT:    s_waitcnt vmcnt(0)
19820; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19821; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19822; GFX7-NEXT:    v_min_f32_e32 v12, v12, v32
19823; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
19824; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19825; GFX7-NEXT:    s_waitcnt vmcnt(0)
19826; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19827; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19828; GFX7-NEXT:    v_min_f32_e32 v11, v11, v32
19829; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
19830; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19831; GFX7-NEXT:    s_waitcnt vmcnt(0)
19832; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19833; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19834; GFX7-NEXT:    v_min_f32_e32 v10, v10, v32
19835; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
19836; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
19837; GFX7-NEXT:    s_waitcnt vmcnt(0)
19838; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19839; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19840; GFX7-NEXT:    v_min_f32_e32 v9, v9, v32
19841; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
19842; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
19843; GFX7-NEXT:    s_waitcnt vmcnt(0)
19844; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19845; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19846; GFX7-NEXT:    v_min_f32_e32 v8, v8, v32
19847; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
19848; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
19849; GFX7-NEXT:    s_waitcnt vmcnt(0)
19850; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19851; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19852; GFX7-NEXT:    v_min_f32_e32 v7, v7, v32
19853; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
19854; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
19855; GFX7-NEXT:    s_waitcnt vmcnt(0)
19856; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19857; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19858; GFX7-NEXT:    v_min_f32_e32 v6, v6, v32
19859; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
19860; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
19861; GFX7-NEXT:    s_waitcnt vmcnt(0)
19862; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19863; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19864; GFX7-NEXT:    v_min_f32_e32 v5, v5, v32
19865; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
19866; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
19867; GFX7-NEXT:    s_waitcnt vmcnt(0)
19868; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19869; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19870; GFX7-NEXT:    v_min_f32_e32 v4, v4, v32
19871; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
19872; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
19873; GFX7-NEXT:    s_waitcnt vmcnt(0)
19874; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19875; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19876; GFX7-NEXT:    v_min_f32_e32 v3, v3, v32
19877; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
19878; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19879; GFX7-NEXT:    s_waitcnt vmcnt(0)
19880; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19881; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19882; GFX7-NEXT:    v_min_f32_e32 v2, v2, v32
19883; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
19884; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19885; GFX7-NEXT:    s_waitcnt vmcnt(0)
19886; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19887; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19888; GFX7-NEXT:    v_min_f32_e32 v1, v1, v32
19889; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
19890; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
19891; GFX7-NEXT:    s_waitcnt vmcnt(0)
19892; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
19893; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
19894; GFX7-NEXT:    v_min_f32_e32 v0, v0, v32
19895; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
19896; GFX7-NEXT:    s_setpc_b64 s[30:31]
19897;
19898; GFX8-LABEL: v_minnum_v32bf16:
19899; GFX8:       ; %bb.0:
19900; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19901; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
19902; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
19903; GFX8-NEXT:    v_min_f32_e32 v31, v32, v31
19904; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
19905; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
19906; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
19907; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
19908; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
19909; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
19910; GFX8-NEXT:    v_min_f32_e32 v14, v14, v30
19911; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
19912; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
19913; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
19914; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
19915; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
19916; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
19917; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
19918; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
19919; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
19920; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
19921; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
19922; GFX8-NEXT:    v_min_f32_e32 v32, v32, v30
19923; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32
19924; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
19925; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
19926; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
19927; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
19928; GFX8-NEXT:    v_min_f32_e32 v13, v13, v29
19929; GFX8-NEXT:    v_bfe_u32 v29, v13, 16, 1
19930; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
19931; GFX8-NEXT:    v_alignbit_b32 v14, v14, v31, 16
19932; GFX8-NEXT:    s_waitcnt vmcnt(0)
19933; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
19934; GFX8-NEXT:    v_min_f32_e32 v33, v33, v34
19935; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
19936; GFX8-NEXT:    v_min_f32_e32 v30, v15, v30
19937; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
19938; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
19939; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
19940; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
19941; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
19942; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
19943; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
19944; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
19945; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
19946; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
19947; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
19948; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
19949; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
19950; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
19951; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
19952; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
19953; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
19954; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
19955; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
19956; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
19957; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
19958; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
19959; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
19960; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
19961; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
19962; GFX8-NEXT:    v_min_f32_e32 v29, v33, v29
19963; GFX8-NEXT:    v_bfe_u32 v33, v29, 16, 1
19964; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v29
19965; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
19966; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
19967; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
19968; GFX8-NEXT:    v_min_f32_e32 v12, v12, v28
19969; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
19970; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
19971; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
19972; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
19973; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
19974; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
19975; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
19976; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
19977; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
19978; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
19979; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
19980; GFX8-NEXT:    v_min_f32_e32 v28, v33, v28
19981; GFX8-NEXT:    v_bfe_u32 v33, v28, 16, 1
19982; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v28
19983; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
19984; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
19985; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
19986; GFX8-NEXT:    v_min_f32_e32 v11, v11, v27
19987; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
19988; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
19989; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
19990; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
19991; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
19992; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
19993; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
19994; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
19995; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
19996; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
19997; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
19998; GFX8-NEXT:    v_min_f32_e32 v27, v33, v27
19999; GFX8-NEXT:    v_bfe_u32 v33, v27, 16, 1
20000; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v27
20001; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
20002; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
20003; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20004; GFX8-NEXT:    v_min_f32_e32 v10, v10, v26
20005; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
20006; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
20007; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
20008; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
20009; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
20010; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
20011; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
20012; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
20013; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
20014; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
20015; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
20016; GFX8-NEXT:    v_min_f32_e32 v26, v33, v26
20017; GFX8-NEXT:    v_bfe_u32 v33, v26, 16, 1
20018; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v26
20019; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
20020; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
20021; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20022; GFX8-NEXT:    v_min_f32_e32 v9, v9, v25
20023; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
20024; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
20025; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
20026; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
20027; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
20028; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
20029; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
20030; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
20031; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
20032; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
20033; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
20034; GFX8-NEXT:    v_min_f32_e32 v25, v33, v25
20035; GFX8-NEXT:    v_bfe_u32 v33, v25, 16, 1
20036; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v25
20037; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
20038; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
20039; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20040; GFX8-NEXT:    v_min_f32_e32 v8, v8, v24
20041; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
20042; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
20043; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
20044; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
20045; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
20046; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
20047; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
20048; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
20049; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
20050; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
20051; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
20052; GFX8-NEXT:    v_min_f32_e32 v24, v33, v24
20053; GFX8-NEXT:    v_bfe_u32 v33, v24, 16, 1
20054; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v24
20055; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
20056; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
20057; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20058; GFX8-NEXT:    v_min_f32_e32 v7, v7, v23
20059; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
20060; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
20061; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
20062; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
20063; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
20064; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
20065; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
20066; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
20067; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
20068; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
20069; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
20070; GFX8-NEXT:    v_min_f32_e32 v23, v33, v23
20071; GFX8-NEXT:    v_bfe_u32 v33, v23, 16, 1
20072; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v23
20073; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
20074; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
20075; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20076; GFX8-NEXT:    v_min_f32_e32 v6, v6, v22
20077; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
20078; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
20079; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
20080; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
20081; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
20082; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
20083; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
20084; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
20085; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
20086; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
20087; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
20088; GFX8-NEXT:    v_min_f32_e32 v22, v33, v22
20089; GFX8-NEXT:    v_bfe_u32 v33, v22, 16, 1
20090; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v22
20091; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
20092; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
20093; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20094; GFX8-NEXT:    v_min_f32_e32 v5, v5, v21
20095; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
20096; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
20097; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
20098; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
20099; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
20100; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
20101; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
20102; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
20103; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
20104; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
20105; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
20106; GFX8-NEXT:    v_min_f32_e32 v21, v33, v21
20107; GFX8-NEXT:    v_bfe_u32 v33, v21, 16, 1
20108; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v21
20109; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
20110; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
20111; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20112; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
20113; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
20114; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
20115; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
20116; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
20117; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
20118; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
20119; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
20120; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
20121; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
20122; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
20123; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
20124; GFX8-NEXT:    v_min_f32_e32 v20, v33, v20
20125; GFX8-NEXT:    v_bfe_u32 v33, v20, 16, 1
20126; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v20
20127; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
20128; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
20129; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20130; GFX8-NEXT:    v_min_f32_e32 v3, v3, v19
20131; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
20132; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
20133; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
20134; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
20135; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
20136; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
20137; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
20138; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
20139; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
20140; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
20141; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
20142; GFX8-NEXT:    v_min_f32_e32 v19, v33, v19
20143; GFX8-NEXT:    v_bfe_u32 v33, v19, 16, 1
20144; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v19
20145; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
20146; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
20147; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20148; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
20149; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
20150; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
20151; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
20152; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
20153; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
20154; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
20155; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
20156; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
20157; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
20158; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
20159; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
20160; GFX8-NEXT:    v_min_f32_e32 v18, v33, v18
20161; GFX8-NEXT:    v_bfe_u32 v33, v18, 16, 1
20162; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v18
20163; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
20164; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
20165; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20166; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
20167; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
20168; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
20169; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
20170; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
20171; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
20172; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
20173; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
20174; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
20175; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
20176; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
20177; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
20178; GFX8-NEXT:    v_min_f32_e32 v17, v33, v17
20179; GFX8-NEXT:    v_bfe_u32 v33, v17, 16, 1
20180; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v17
20181; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
20182; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
20183; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
20184; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
20185; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
20186; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
20187; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
20188; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
20189; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
20190; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
20191; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
20192; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
20193; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
20194; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
20195; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
20196; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
20197; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
20198; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
20199; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
20200; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
20201; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
20202; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
20203; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
20204; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
20205; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
20206; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
20207; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
20208; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
20209; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
20210; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
20211; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
20212; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
20213; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
20214; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
20215; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
20216; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
20217; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
20218; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
20219; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
20220; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
20221; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
20222; GFX8-NEXT:    v_alignbit_b32 v13, v13, v32, 16
20223; GFX8-NEXT:    v_alignbit_b32 v15, v16, v15, 16
20224; GFX8-NEXT:    s_setpc_b64 s[30:31]
20225;
20226; GFX9-LABEL: v_minnum_v32bf16:
20227; GFX9:       ; %bb.0:
20228; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20229; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
20230; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
20231; GFX9-NEXT:    v_min_f32_e32 v31, v32, v31
20232; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
20233; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
20234; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
20235; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
20236; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
20237; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
20238; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
20239; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
20240; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
20241; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
20242; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
20243; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
20244; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
20245; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
20246; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
20247; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
20248; GFX9-NEXT:    v_min_f32_e32 v30, v32, v30
20249; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
20250; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
20251; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
20252; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
20253; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
20254; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
20255; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
20256; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
20257; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
20258; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
20259; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
20260; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
20261; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
20262; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
20263; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
20264; GFX9-NEXT:    v_min_f32_e32 v32, v32, v29
20265; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
20266; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
20267; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
20268; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
20269; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
20270; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
20271; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
20272; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
20273; GFX9-NEXT:    s_waitcnt vmcnt(0)
20274; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
20275; GFX9-NEXT:    v_min_f32_e32 v33, v33, v34
20276; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
20277; GFX9-NEXT:    v_min_f32_e32 v29, v15, v29
20278; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
20279; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
20280; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
20281; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
20282; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
20283; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
20284; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
20285; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
20286; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
20287; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
20288; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
20289; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
20290; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
20291; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
20292; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
20293; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
20294; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
20295; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
20296; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
20297; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
20298; GFX9-NEXT:    v_min_f32_e32 v28, v33, v28
20299; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
20300; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
20301; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
20302; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
20303; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
20304; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
20305; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
20306; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
20307; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
20308; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
20309; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
20310; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
20311; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
20312; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
20313; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
20314; GFX9-NEXT:    v_min_f32_e32 v27, v33, v27
20315; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
20316; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
20317; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
20318; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
20319; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
20320; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
20321; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
20322; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
20323; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
20324; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
20325; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
20326; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
20327; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
20328; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
20329; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
20330; GFX9-NEXT:    v_min_f32_e32 v26, v33, v26
20331; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
20332; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
20333; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
20334; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
20335; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
20336; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
20337; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
20338; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
20339; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
20340; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
20341; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
20342; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
20343; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
20344; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
20345; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
20346; GFX9-NEXT:    v_min_f32_e32 v25, v33, v25
20347; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
20348; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
20349; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
20350; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
20351; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
20352; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
20353; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
20354; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
20355; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
20356; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
20357; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
20358; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
20359; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
20360; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
20361; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
20362; GFX9-NEXT:    v_min_f32_e32 v24, v33, v24
20363; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
20364; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
20365; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
20366; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
20367; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
20368; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
20369; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
20370; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
20371; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
20372; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
20373; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
20374; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
20375; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
20376; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
20377; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
20378; GFX9-NEXT:    v_min_f32_e32 v23, v33, v23
20379; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
20380; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
20381; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
20382; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
20383; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
20384; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
20385; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
20386; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
20387; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
20388; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
20389; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
20390; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
20391; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
20392; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
20393; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
20394; GFX9-NEXT:    v_min_f32_e32 v22, v33, v22
20395; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
20396; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
20397; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
20398; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
20399; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
20400; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
20401; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
20402; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
20403; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
20404; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
20405; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
20406; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
20407; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
20408; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
20409; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
20410; GFX9-NEXT:    v_min_f32_e32 v21, v33, v21
20411; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
20412; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
20413; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
20414; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
20415; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
20416; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
20417; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
20418; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
20419; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
20420; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
20421; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
20422; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
20423; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
20424; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
20425; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
20426; GFX9-NEXT:    v_min_f32_e32 v20, v33, v20
20427; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
20428; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
20429; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
20430; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
20431; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
20432; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
20433; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
20434; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
20435; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
20436; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
20437; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
20438; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
20439; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
20440; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
20441; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
20442; GFX9-NEXT:    v_min_f32_e32 v19, v33, v19
20443; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
20444; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
20445; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
20446; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
20447; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
20448; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
20449; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
20450; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
20451; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
20452; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
20453; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
20454; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
20455; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
20456; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
20457; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
20458; GFX9-NEXT:    v_min_f32_e32 v18, v33, v18
20459; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
20460; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
20461; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
20462; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
20463; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
20464; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
20465; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
20466; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
20467; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
20468; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
20469; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
20470; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
20471; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
20472; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
20473; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
20474; GFX9-NEXT:    v_min_f32_e32 v17, v33, v17
20475; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
20476; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
20477; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
20478; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
20479; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
20480; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
20481; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
20482; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
20483; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
20484; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
20485; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
20486; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
20487; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
20488; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
20489; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
20490; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
20491; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
20492; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
20493; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
20494; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
20495; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
20496; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
20497; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
20498; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
20499; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
20500; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
20501; GFX9-NEXT:    v_perm_b32 v12, v12, v32, s4
20502; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
20503; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
20504; GFX9-NEXT:    v_perm_b32 v15, v29, v15, s4
20505; GFX9-NEXT:    s_setpc_b64 s[30:31]
20506;
20507; GFX10-LABEL: v_minnum_v32bf16:
20508; GFX10:       ; %bb.0:
20509; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20510; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
20511; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
20512; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
20513; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
20514; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
20515; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
20516; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
20517; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
20518; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
20519; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
20520; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
20521; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
20522; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
20523; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
20524; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
20525; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
20526; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
20527; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
20528; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
20529; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
20530; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
20531; GFX10-NEXT:    v_min_f32_e32 v39, v48, v39
20532; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v6
20533; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
20534; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
20535; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
20536; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v21
20537; GFX10-NEXT:    v_min_f32_e32 v49, v50, v49
20538; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
20539; GFX10-NEXT:    v_min_f32_e32 v33, v34, v33
20540; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
20541; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
20542; GFX10-NEXT:    v_min_f32_e32 v35, v36, v35
20543; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
20544; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
20545; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
20546; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
20547; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
20548; GFX10-NEXT:    v_min_f32_e32 v37, v38, v37
20549; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v7
20550; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
20551; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
20552; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
20553; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
20554; GFX10-NEXT:    v_min_f32_e32 v27, v50, v27
20555; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v0
20556; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
20557; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
20558; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
20559; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
20560; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
20561; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
20562; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
20563; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
20564; GFX10-NEXT:    v_min_f32_e32 v8, v8, v24
20565; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v18
20566; GFX10-NEXT:    v_min_f32_e32 v29, v38, v29
20567; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v2
20568; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
20569; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
20570; GFX10-NEXT:    v_min_f32_e32 v7, v7, v23
20571; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
20572; GFX10-NEXT:    v_min_f32_e32 v28, v48, v28
20573; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
20574; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
20575; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
20576; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
20577; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
20578; GFX10-NEXT:    v_min_f32_e32 v10, v10, v26
20579; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
20580; GFX10-NEXT:    v_min_f32_e32 v34, v34, v51
20581; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
20582; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
20583; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
20584; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
20585; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
20586; GFX10-NEXT:    v_min_f32_e32 v30, v36, v30
20587; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v3
20588; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
20589; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
20590; GFX10-NEXT:    v_min_f32_e32 v2, v2, v18
20591; GFX10-NEXT:    v_min_f32_e32 v18, v48, v23
20592; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
20593; GFX10-NEXT:    v_min_f32_e32 v17, v50, v22
20594; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v33
20595; GFX10-NEXT:    v_bfe_u32 v23, v14, 16, 1
20596; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
20597; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
20598; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
20599; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
20600; GFX10-NEXT:    v_min_f32_e32 v4, v4, v20
20601; GFX10-NEXT:    v_min_f32_e32 v20, v36, v25
20602; GFX10-NEXT:    v_min_f32_e32 v3, v3, v19
20603; GFX10-NEXT:    v_min_f32_e32 v19, v38, v24
20604; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v14
20605; GFX10-NEXT:    v_bfe_u32 v25, v35, 16, 1
20606; GFX10-NEXT:    v_add3_u32 v23, v23, v14, 0x7fff
20607; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v22, vcc_lo
20608; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
20609; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
20610; GFX10-NEXT:    v_min_f32_e32 v21, v51, v26
20611; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v35
20612; GFX10-NEXT:    v_bfe_u32 v36, v13, 16, 1
20613; GFX10-NEXT:    v_add3_u32 v25, v25, v35, 0x7fff
20614; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
20615; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
20616; GFX10-NEXT:    v_or_b32_e32 v38, 0x400000, v13
20617; GFX10-NEXT:    v_bfe_u32 v48, v37, 16, 1
20618; GFX10-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
20619; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v37
20620; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
20621; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
20622; GFX10-NEXT:    v_bfe_u32 v51, v12, 16, 1
20623; GFX10-NEXT:    v_add3_u32 v48, v48, v37, 0x7fff
20624; GFX10-NEXT:    v_or_b32_e32 v33, 0x400000, v12
20625; GFX10-NEXT:    v_bfe_u32 v22, v39, 16, 1
20626; GFX10-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc_lo
20627; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
20628; GFX10-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
20629; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v39
20630; GFX10-NEXT:    v_bfe_u32 v24, v11, 16, 1
20631; GFX10-NEXT:    v_add3_u32 v22, v22, v39, 0x7fff
20632; GFX10-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc_lo
20633; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
20634; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v11
20635; GFX10-NEXT:    v_bfe_u32 v26, v49, 16, 1
20636; GFX10-NEXT:    v_add3_u32 v24, v24, v11, 0x7fff
20637; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v49
20638; GFX10-NEXT:    v_cndmask_b32_e32 v33, v51, v33, vcc_lo
20639; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
20640; GFX10-NEXT:    v_bfe_u32 v38, v10, 16, 1
20641; GFX10-NEXT:    v_add3_u32 v26, v26, v49, 0x7fff
20642; GFX10-NEXT:    v_or_b32_e32 v37, 0x400000, v10
20643; GFX10-NEXT:    v_bfe_u32 v50, v34, 16, 1
20644; GFX10-NEXT:    v_cndmask_b32_e32 v14, v22, v14, vcc_lo
20645; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
20646; GFX10-NEXT:    v_add3_u32 v38, v38, v10, 0x7fff
20647; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v34
20648; GFX10-NEXT:    v_bfe_u32 v51, v9, 16, 1
20649; GFX10-NEXT:    v_add3_u32 v50, v50, v34, 0x7fff
20650; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v35, vcc_lo
20651; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
20652; GFX10-NEXT:    v_or_b32_e32 v39, 0x400000, v9
20653; GFX10-NEXT:    v_bfe_u32 v22, v30, 16, 1
20654; GFX10-NEXT:    v_add3_u32 v51, v51, v9, 0x7fff
20655; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v30
20656; GFX10-NEXT:    v_cndmask_b32_e32 v13, v26, v13, vcc_lo
20657; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
20658; GFX10-NEXT:    v_bfe_u32 v35, v8, 16, 1
20659; GFX10-NEXT:    v_add3_u32 v22, v22, v30, 0x7fff
20660; GFX10-NEXT:    v_or_b32_e32 v49, 0x400000, v8
20661; GFX10-NEXT:    v_bfe_u32 v26, v29, 16, 1
20662; GFX10-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc_lo
20663; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
20664; GFX10-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
20665; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v29
20666; GFX10-NEXT:    v_bfe_u32 v38, v7, 16, 1
20667; GFX10-NEXT:    v_add3_u32 v26, v26, v29, 0x7fff
20668; GFX10-NEXT:    v_cndmask_b32_e32 v12, v50, v12, vcc_lo
20669; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
20670; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v7
20671; GFX10-NEXT:    v_bfe_u32 v50, v28, 16, 1
20672; GFX10-NEXT:    v_add3_u32 v38, v38, v7, 0x7fff
20673; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v28
20674; GFX10-NEXT:    v_cndmask_b32_e32 v39, v51, v39, vcc_lo
20675; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
20676; GFX10-NEXT:    v_bfe_u32 v51, v6, 16, 1
20677; GFX10-NEXT:    v_add3_u32 v50, v50, v28, 0x7fff
20678; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v6
20679; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
20680; GFX10-NEXT:    v_cndmask_b32_e32 v11, v22, v11, vcc_lo
20681; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
20682; GFX10-NEXT:    v_bfe_u32 v22, v27, 16, 1
20683; GFX10-NEXT:    v_add3_u32 v51, v51, v6, 0x7fff
20684; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v27
20685; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
20686; GFX10-NEXT:    v_cndmask_b32_e32 v35, v35, v49, vcc_lo
20687; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
20688; GFX10-NEXT:    v_bfe_u32 v49, v5, 16, 1
20689; GFX10-NEXT:    v_add3_u32 v22, v22, v27, 0x7fff
20690; GFX10-NEXT:    v_or_b32_e32 v29, 0x400000, v5
20691; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
20692; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
20693; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
20694; GFX10-NEXT:    v_add3_u32 v49, v49, v5, 0x7fff
20695; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v21
20696; GFX10-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
20697; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
20698; GFX10-NEXT:    v_bfe_u32 v38, v4, 16, 1
20699; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
20700; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v4
20701; GFX10-NEXT:    v_cndmask_b32_e32 v9, v50, v9, vcc_lo
20702; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
20703; GFX10-NEXT:    v_bfe_u32 v50, v20, 16, 1
20704; GFX10-NEXT:    v_add3_u32 v38, v38, v4, 0x7fff
20705; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v20
20706; GFX10-NEXT:    v_cndmask_b32_e32 v30, v51, v30, vcc_lo
20707; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
20708; GFX10-NEXT:    v_add3_u32 v50, v50, v20, 0x7fff
20709; GFX10-NEXT:    v_bfe_u32 v51, v3, 16, 1
20710; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v3
20711; GFX10-NEXT:    v_cndmask_b32_e32 v8, v22, v8, vcc_lo
20712; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
20713; GFX10-NEXT:    v_bfe_u32 v22, v19, 16, 1
20714; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v19
20715; GFX10-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
20716; GFX10-NEXT:    v_cndmask_b32_e32 v29, v49, v29, vcc_lo
20717; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
20718; GFX10-NEXT:    v_add3_u32 v22, v22, v19, 0x7fff
20719; GFX10-NEXT:    v_bfe_u32 v49, v2, 16, 1
20720; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v2
20721; GFX10-NEXT:    v_cndmask_b32_e32 v7, v26, v7, vcc_lo
20722; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
20723; GFX10-NEXT:    v_bfe_u32 v26, v18, 16, 1
20724; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v18
20725; GFX10-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
20726; GFX10-NEXT:    v_cndmask_b32_e32 v28, v38, v28, vcc_lo
20727; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
20728; GFX10-NEXT:    v_bfe_u32 v38, v1, 16, 1
20729; GFX10-NEXT:    v_add3_u32 v26, v26, v18, 0x7fff
20730; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v1
20731; GFX10-NEXT:    v_cndmask_b32_e32 v6, v50, v6, vcc_lo
20732; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
20733; GFX10-NEXT:    v_bfe_u32 v50, v17, 16, 1
20734; GFX10-NEXT:    v_add3_u32 v38, v38, v1, 0x7fff
20735; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v17
20736; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
20737; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
20738; GFX10-NEXT:    v_bfe_u32 v22, v0, 16, 1
20739; GFX10-NEXT:    v_add3_u32 v50, v50, v17, 0x7fff
20740; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v0
20741; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v4, vcc_lo
20742; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
20743; GFX10-NEXT:    v_add3_u32 v22, v22, v0, 0x7fff
20744; GFX10-NEXT:    v_cndmask_b32_e32 v1, v38, v20, vcc_lo
20745; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
20746; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
20747; GFX10-NEXT:    v_cndmask_b32_e32 v17, v50, v19, vcc_lo
20748; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
20749; GFX10-NEXT:    v_perm_b32 v4, v28, v7, 0x7060302
20750; GFX10-NEXT:    v_perm_b32 v7, v34, v10, 0x7060302
20751; GFX10-NEXT:    v_cndmask_b32_e32 v0, v22, v18, vcc_lo
20752; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
20753; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
20754; GFX10-NEXT:    v_cndmask_b32_e32 v2, v49, v21, vcc_lo
20755; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
20756; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
20757; GFX10-NEXT:    v_cndmask_b32_e32 v3, v51, v27, vcc_lo
20758; GFX10-NEXT:    v_perm_b32 v5, v29, v8, 0x7060302
20759; GFX10-NEXT:    v_perm_b32 v8, v35, v11, 0x7060302
20760; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
20761; GFX10-NEXT:    v_perm_b32 v6, v30, v9, 0x7060302
20762; GFX10-NEXT:    v_perm_b32 v9, v39, v12, 0x7060302
20763; GFX10-NEXT:    s_waitcnt vmcnt(0)
20764; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
20765; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
20766; GFX10-NEXT:    v_min_f32_e32 v17, v31, v17
20767; GFX10-NEXT:    v_min_f32_e32 v15, v15, v18
20768; GFX10-NEXT:    v_bfe_u32 v10, v17, 16, 1
20769; GFX10-NEXT:    v_bfe_u32 v11, v15, 16, 1
20770; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v17
20771; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
20772; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v15
20773; GFX10-NEXT:    v_add3_u32 v18, v10, v17, 0x7fff
20774; GFX10-NEXT:    v_add3_u32 v11, v11, v15, 0x7fff
20775; GFX10-NEXT:    v_perm_b32 v10, v37, v13, 0x7060302
20776; GFX10-NEXT:    v_perm_b32 v13, v36, v25, 0x7060302
20777; GFX10-NEXT:    v_cndmask_b32_e32 v17, v18, v12, vcc_lo
20778; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
20779; GFX10-NEXT:    v_perm_b32 v12, v33, v48, 0x7060302
20780; GFX10-NEXT:    v_cndmask_b32_e32 v15, v11, v19, vcc_lo
20781; GFX10-NEXT:    v_perm_b32 v11, v24, v14, 0x7060302
20782; GFX10-NEXT:    v_perm_b32 v14, v23, v16, 0x7060302
20783; GFX10-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
20784; GFX10-NEXT:    s_setpc_b64 s[30:31]
20785;
20786; GFX11-LABEL: v_minnum_v32bf16:
20787; GFX11:       ; %bb.0:
20788; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20789; GFX11-NEXT:    scratch_load_b32 v32, off, s32
20790; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
20791; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
20792; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
20793; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
20794; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
20795; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
20796; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
20797; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
20798; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
20799; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
20800; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
20801; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20802; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
20803; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
20804; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
20805; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
20806; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
20807; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
20808; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
20809; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
20810; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
20811; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
20812; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
20813; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
20814; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
20815; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
20816; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20817; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
20818; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
20819; GFX11-NEXT:    v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
20820; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
20821; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
20822; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
20823; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
20824; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
20825; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
20826; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
20827; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
20828; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
20829; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
20830; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
20831; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
20832; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
20833; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
20834; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
20835; GFX11-NEXT:    v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
20836; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
20837; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
20838; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
20839; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
20840; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
20841; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
20842; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
20843; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
20844; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
20845; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
20846; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
20847; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
20848; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
20849; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
20850; GFX11-NEXT:    v_min_f32_e32 v2, v2, v18
20851; GFX11-NEXT:    v_min_f32_e32 v0, v0, v16
20852; GFX11-NEXT:    v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
20853; GFX11-NEXT:    v_min_f32_e32 v7, v7, v23
20854; GFX11-NEXT:    v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
20855; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
20856; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
20857; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
20858; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
20859; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
20860; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
20861; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
20862; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
20863; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
20864; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
20865; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
20866; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
20867; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
20868; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
20869; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
20870; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
20871; GFX11-NEXT:    v_min_f32_e32 v4, v4, v20
20872; GFX11-NEXT:    v_min_f32_e32 v20, v80, v71
20873; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
20874; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
20875; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
20876; GFX11-NEXT:    v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
20877; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
20878; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
20879; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
20880; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
20881; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
20882; GFX11-NEXT:    v_min_f32_e32 v26, v52, v51
20883; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
20884; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
20885; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
20886; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
20887; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
20888; GFX11-NEXT:    v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
20889; GFX11-NEXT:    v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
20890; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
20891; GFX11-NEXT:    v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
20892; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
20893; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20894; GFX11-NEXT:    v_min_f32_e32 v29, v38, v37
20895; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
20896; GFX11-NEXT:    v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
20897; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
20898; GFX11-NEXT:    v_min_f32_e32 v14, v14, v30
20899; GFX11-NEXT:    v_min_f32_e32 v28, v48, v39
20900; GFX11-NEXT:    v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
20901; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
20902; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
20903; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
20904; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
20905; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
20906; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
20907; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
20908; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
20909; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
20910; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
20911; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
20912; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
20913; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
20914; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
20915; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
20916; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
20917; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
20918; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
20919; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
20920; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
20921; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
20922; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
20923; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
20924; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
20925; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
20926; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
20927; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
20928; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
20929; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
20930; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
20931; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
20932; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
20933; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
20934; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
20935; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
20936; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
20937; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
20938; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
20939; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
20940; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
20941; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
20942; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
20943; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
20944; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
20945; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
20946; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
20947; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
20948; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
20949; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
20950; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
20951; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
20952; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
20953; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
20954; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
20955; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
20956; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
20957; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
20958; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
20959; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
20960; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
20961; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
20962; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
20963; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
20964; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
20965; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
20966; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
20967; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
20968; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
20969; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
20970; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
20971; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
20972; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
20973; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
20974; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
20975; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
20976; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
20977; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
20978; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
20979; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
20980; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
20981; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
20982; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
20983; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
20984; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
20985; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
20986; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
20987; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
20988; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
20989; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
20990; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
20991; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
20992; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
20993; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
20994; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
20995; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
20996; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
20997; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
20998; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
20999; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
21000; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
21001; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
21002; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
21003; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
21004; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
21005; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21006; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
21007; GFX11-NEXT:    v_cndmask_b32_e32 v23, v97, v98, vcc_lo
21008; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
21009; GFX11-NEXT:    v_cndmask_b32_e32 v6, v99, v100, vcc_lo
21010; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
21011; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
21012; GFX11-NEXT:    v_cndmask_b32_e32 v22, v101, v102, vcc_lo
21013; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
21014; GFX11-NEXT:    v_cndmask_b32_e32 v5, v103, v112, vcc_lo
21015; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
21016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21017; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
21018; GFX11-NEXT:    v_cndmask_b32_e32 v21, v113, v114, vcc_lo
21019; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
21020; GFX11-NEXT:    v_cndmask_b32_e32 v4, v115, v116, vcc_lo
21021; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
21022; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
21023; GFX11-NEXT:    v_cndmask_b32_e32 v20, v117, v118, vcc_lo
21024; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
21025; GFX11-NEXT:    v_cndmask_b32_e32 v19, v129, v130, vcc_lo
21026; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
21027; GFX11-NEXT:    v_cndmask_b32_e32 v18, v133, v134, vcc_lo
21028; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
21029; GFX11-NEXT:    v_cndmask_b32_e32 v1, v135, v144, vcc_lo
21030; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
21031; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21032; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
21033; GFX11-NEXT:    v_cndmask_b32_e32 v17, v145, v146, vcc_lo
21034; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21035; GFX11-NEXT:    v_cndmask_b32_e32 v0, v147, v33, vcc_lo
21036; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
21037; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
21038; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
21039; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
21040; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
21041; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
21042; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
21043; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
21044; GFX11-NEXT:    s_waitcnt vmcnt(0)
21045; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
21046; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21047; GFX11-NEXT:    v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
21048; GFX11-NEXT:    v_min_f32_e32 v15, v15, v18
21049; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
21050; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
21051; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
21052; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
21053; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
21054; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
21055; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
21056; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
21057; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
21058; GFX11-NEXT:    v_cndmask_b32_e32 v17, v18, v20, vcc_lo
21059; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
21060; GFX11-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
21061; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
21062; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
21063; GFX11-NEXT:    s_setpc_b64 s[30:31]
21064  %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
21065  ret <32 x bfloat> %op
21066}
21067
21068
; Declarations of the llvm.maxnum intrinsic overloads on bfloat: the scalar
; form plus <2 x>, <3 x>, <4 x>, <8 x>, <16 x> and <32 x bfloat> vectors.
; The scalar overload is called by v_maxnum_bf16; the vector overloads are
; presumably exercised by the matching v_maxnum_v*bf16 tests (verify there).
21069declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
21070declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
21071declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
21072declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
21073declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
21074declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
21075declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
21076
; Test: scalar bfloat maxnum. The IR body calls @llvm.maxnum.bf16 on %a and %b
; and returns the result.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file: utils/update_llc_test_checks.py); regenerate them with that script
; rather than editing them by hand.
;
; There is one check block per RUN-line prefix: GCN, GFX7, GFX8, GFX9, GFX10
; and GFX11. The GFX11 prefix is shared by both gfx1100 RUN lines
; (+real-true16 and -real-true16), so both must produce this same output.
;
; Expected code shape, as recorded by the checks:
;  - GCN / GFX7: v_mul_f32 by 1.0 on each input, mask each to its high 16 bits
;    (0xffff0000), v_max_f32, then mask the result with 0xffff0000 again.
;  - GFX8 and later: shift both inputs left by 16, v_max_f32, then a
;    v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask
;    sequence followed by a right shift by 16. GFX8 uses two v_add_u32,
;    GFX9 materializes 0x7fff in s4 for v_add3_u32, and GFX10/GFX11 pass
;    0x7fff directly to v_add3_u32.
;    NOTE(review): this sequence looks like f32->bf16 round-to-nearest-even
;    with NaN quieting - confirm against the lowering code.
21077define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
21078; GCN-LABEL: v_maxnum_bf16:
21079; GCN:       ; %bb.0:
21080; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21081; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21082; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21083; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21084; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21085; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
21086; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21087; GCN-NEXT:    s_setpc_b64 s[30:31]
21088;
21089; GFX7-LABEL: v_maxnum_bf16:
21090; GFX7:       ; %bb.0:
21091; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21092; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21093; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21094; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21095; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21096; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
21097; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21098; GFX7-NEXT:    s_setpc_b64 s[30:31]
21099;
21100; GFX8-LABEL: v_maxnum_bf16:
21101; GFX8:       ; %bb.0:
21102; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21103; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21104; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
21105; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
21106; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
21107; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
21108; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
21109; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
21110; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21111; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
21112; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21113; GFX8-NEXT:    s_setpc_b64 s[30:31]
21114;
21115; GFX9-LABEL: v_maxnum_bf16:
21116; GFX9:       ; %bb.0:
21117; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21118; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21119; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
21120; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
21121; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
21122; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
21123; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
21124; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
21125; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21126; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
21127; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21128; GFX9-NEXT:    s_setpc_b64 s[30:31]
21129;
21130; GFX10-LABEL: v_maxnum_bf16:
21131; GFX10:       ; %bb.0:
21132; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21133; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21134; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
21135; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
21136; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
21137; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
21138; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21139; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
21140; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
21141; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21142; GFX10-NEXT:    s_setpc_b64 s[30:31]
21143;
21144; GFX11-LABEL: v_maxnum_bf16:
21145; GFX11:       ; %bb.0:
21146; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21147; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21148; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
21149; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21150; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
21151; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
21152; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
21153; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21154; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
21155; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
21156; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
21157; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
21158; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21159; GFX11-NEXT:    s_setpc_b64 s[30:31]
21160  %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
21161  ret bfloat %op
21162}
21163
; Test v_maxnum_v2bf16 - calls llvm.maxnum.v2bf16 on two <2 x bfloat> arguments
; and returns the result. The per-prefix comment lines inside the function are
; FileCheck assertions that were autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of this file), so regenerate them with that
; script instead of editing them by hand.
;
; What the expected output shows
;  * GCN and GFX7 prefixes - the checks pair v0 with v2 and v1 with v3, so every
;    element appears to travel in its own VGPR. Each operand is multiplied by 1.0
;    and masked with 0xffff0000 before v_max_f32, and each result is masked with
;    0xffff0000 again.
;  * GFX8 through GFX11 prefixes - v0 and v1 each carry both elements of one
;    argument. The low half is widened with a left shift by 16 and the high half
;    with a 0xffff0000 mask, then v_max_f32 is applied per half. Narrowing adds
;    bit 16 of the f32 result plus 0x7fff, and v_cmp_u selects the value OR-ed
;    with 0x400000 when the result is unordered with itself (presumably
;    round-to-nearest-even with NaN quieting - TODO confirm against the lowering).
;    The halves are repacked with v_alignbit_b32 on GFX8 and with v_perm_b32 using
;    selector 0x7060302 on GFX9 and later.
21164define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
21165; GCN-LABEL: v_maxnum_v2bf16:
21166; GCN:       ; %bb.0:
21167; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21168; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21169; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21170; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21171; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21172; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21173; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21174; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21175; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21176; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
21177; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
21178; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21179; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21180; GCN-NEXT:    s_setpc_b64 s[30:31]
21181;
21182; GFX7-LABEL: v_maxnum_v2bf16:
21183; GFX7:       ; %bb.0:
21184; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21185; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21186; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21187; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21188; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21189; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21190; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21191; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21192; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21193; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
21194; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
21195; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21196; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21197; GFX7-NEXT:    s_setpc_b64 s[30:31]
21198;
21199; GFX8-LABEL: v_maxnum_v2bf16:
21200; GFX8:       ; %bb.0:
21201; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21202; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
21203; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
21204; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
21205; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
21206; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
21207; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21208; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21209; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
21210; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
21211; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
21212; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
21213; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
21214; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
21215; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
21216; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
21217; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
21218; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21219; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
21220; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21221; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
21222; GFX8-NEXT:    s_setpc_b64 s[30:31]
21223;
21224; GFX9-LABEL: v_maxnum_v2bf16:
21225; GFX9:       ; %bb.0:
21226; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21227; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
21228; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
21229; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
21230; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21231; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21232; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
21233; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
21234; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
21235; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
21236; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
21237; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
21238; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
21239; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
21240; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
21241; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
21242; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21243; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
21244; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
21245; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
21246; GFX9-NEXT:    s_setpc_b64 s[30:31]
21247;
21248; GFX10-LABEL: v_maxnum_v2bf16:
21249; GFX10:       ; %bb.0:
21250; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21251; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
21252; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
21253; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21254; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21255; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
21256; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
21257; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
21258; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v2
21259; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
21260; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
21261; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
21262; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
21263; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
21264; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
21265; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21266; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
21267; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
21268; GFX10-NEXT:    s_setpc_b64 s[30:31]
21269;
21270; GFX11-LABEL: v_maxnum_v2bf16:
21271; GFX11:       ; %bb.0:
21272; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21273; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
21274; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21275; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
21276; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21277; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
21278; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
21279; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
21280; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
21281; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
21282; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
21283; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
21284; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
21285; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
21286; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
21287; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
21288; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
21289; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
21290; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21291; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
21292; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
21293; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
21294; GFX11-NEXT:    s_setpc_b64 s[30:31]
21295  %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
21296  ret <2 x bfloat> %op
21297}
21298
; Test v_maxnum_v3bf16 - calls llvm.maxnum.v3bf16 on two <3 x bfloat> arguments
; and returns the result. The per-prefix comment lines inside the function are
; FileCheck assertions that were autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of this file), so regenerate them with that
; script instead of editing them by hand.
;
; What the expected output shows
;  * GCN and GFX7 prefixes - the checks pair v0 with v3, v1 with v4 and v2 with
;    v5, so every element appears to travel in its own VGPR. Each operand is
;    multiplied by 1.0 and masked with 0xffff0000 before v_max_f32, and each
;    result is masked with 0xffff0000 again.
;  * GFX8 and later - v0 and v2 are split into a low half (left shift by 16) and
;    a high half (0xffff0000 mask), while v1 and v3 are only shifted left by 16,
;    so only their low half is consumed (presumably the third element - TODO
;    confirm against the calling convention). Every v_max_f32 result is narrowed
;    with the bit-16 plus 0x7fff add, and v_cmp_u selects the value OR-ed with
;    0x400000 when the result is unordered with itself.
;  * The third result is moved into the low half of v1 with v_lshrrev_b32 on GFX8
;    and with v_alignbit_b32 on GFX9 and later. The first source operand of that
;    v_alignbit_b32 is a register that is never written in the function (s4 on
;    GFX10, s0 on the fake16 GFX11 run), which suggests the upper half of v1 is
;    a don't-care - NOTE(review) confirm.
;  * GFX11 has separate true16 and fake16 prefixes here. In the checks below the
;    two bodies differ only in that first v_alignbit_b32 source operand (v0 for
;    true16, s0 for fake16).
21299define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
21300; GCN-LABEL: v_maxnum_v3bf16:
21301; GCN:       ; %bb.0:
21302; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21303; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21304; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21305; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21306; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
21307; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21308; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
21309; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21310; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21311; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21312; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21313; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21314; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21315; GCN-NEXT:    v_max_f32_e32 v2, v2, v5
21316; GCN-NEXT:    v_max_f32_e32 v1, v1, v4
21317; GCN-NEXT:    v_max_f32_e32 v0, v0, v3
21318; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21319; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21320; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21321; GCN-NEXT:    s_setpc_b64 s[30:31]
21322;
21323; GFX7-LABEL: v_maxnum_v3bf16:
21324; GFX7:       ; %bb.0:
21325; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21326; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21327; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21328; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21329; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
21330; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21331; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
21332; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21333; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21334; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21335; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21336; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21337; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21338; GFX7-NEXT:    v_max_f32_e32 v2, v2, v5
21339; GFX7-NEXT:    v_max_f32_e32 v1, v1, v4
21340; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
21341; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21342; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21343; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21344; GFX7-NEXT:    s_setpc_b64 s[30:31]
21345;
21346; GFX8-LABEL: v_maxnum_v3bf16:
21347; GFX8:       ; %bb.0:
21348; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21349; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
21350; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21351; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
21352; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
21353; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
21354; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
21355; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
21356; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
21357; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
21358; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
21359; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
21360; GFX8-NEXT:    v_max_f32_e32 v3, v4, v3
21361; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
21362; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
21363; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
21364; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21365; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21366; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
21367; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
21368; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
21369; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
21370; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
21371; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
21372; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
21373; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
21374; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
21375; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21376; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
21377; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21378; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
21379; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
21380; GFX8-NEXT:    s_setpc_b64 s[30:31]
21381;
21382; GFX9-LABEL: v_maxnum_v3bf16:
21383; GFX9:       ; %bb.0:
21384; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21385; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
21386; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21387; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
21388; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
21389; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
21390; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
21391; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
21392; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
21393; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
21394; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
21395; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
21396; GFX9-NEXT:    v_max_f32_e32 v3, v4, v3
21397; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21398; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21399; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
21400; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
21401; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
21402; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
21403; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
21404; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
21405; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
21406; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
21407; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
21408; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21409; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
21410; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
21411; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
21412; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
21413; GFX9-NEXT:    s_setpc_b64 s[30:31]
21414;
21415; GFX10-LABEL: v_maxnum_v3bf16:
21416; GFX10:       ; %bb.0:
21417; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21418; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
21419; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
21420; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21421; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21422; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
21423; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
21424; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
21425; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
21426; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
21427; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
21428; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
21429; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
21430; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
21431; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
21432; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
21433; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
21434; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
21435; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
21436; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
21437; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21438; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21439; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21440; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
21441; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
21442; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21443; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
21444; GFX10-NEXT:    s_setpc_b64 s[30:31]
21445;
21446; GFX11TRUE16-LABEL: v_maxnum_v3bf16:
21447; GFX11TRUE16:       ; %bb.0:
21448; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21449; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
21450; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
21451; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
21452; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21453; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21454; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
21455; GFX11TRUE16-NEXT:    v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
21456; GFX11TRUE16-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
21457; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
21458; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
21459; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
21460; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
21461; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
21462; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
21463; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
21464; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
21465; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
21466; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
21467; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
21468; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21469; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21470; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21471; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21472; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
21473; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
21474; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21475; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
21476; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
21477; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
21478;
21479; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
21480; GFX11FAKE16:       ; %bb.0:
21481; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21482; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
21483; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
21484; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
21485; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21486; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21487; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
21488; GFX11FAKE16-NEXT:    v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
21489; GFX11FAKE16-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
21490; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
21491; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
21492; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
21493; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
21494; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
21495; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
21496; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
21497; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
21498; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
21499; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
21500; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
21501; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21502; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21503; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21504; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21505; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
21506; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
21507; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21508; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
21509; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
21510; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
21511  %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
21512  ret <3 x bfloat> %op
21513}
21514
; Test v_maxnum_v4bf16 - calls llvm.maxnum.v4bf16 on two <4 x bfloat> arguments
; and returns the result. The per-prefix comment lines inside the function are
; FileCheck assertions that were autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of this file), so regenerate them with that
; script instead of editing them by hand.
;
; What the expected output shows
;  * GCN and GFX7 prefixes - the checks pair v0 with v4, v1 with v5, v2 with v6
;    and v3 with v7, so every element appears to travel in its own VGPR. Each
;    operand is multiplied by 1.0 and masked with 0xffff0000 before v_max_f32,
;    and each result is masked with 0xffff0000 again.
;  * GFX8 through GFX11 prefixes - v0 is paired with v2 and v1 with v3, and each
;    register is split into a low half (left shift by 16) and a high half
;    (0xffff0000 mask) before v_max_f32. Every result is narrowed with the bit-16
;    plus 0x7fff add, and v_cmp_u selects the value OR-ed with 0x400000 when the
;    result is unordered with itself (presumably round-to-nearest-even with NaN
;    quieting - TODO confirm against the lowering).
;  * The halves are repacked into v0 and v1 with v_lshrrev_b32 plus
;    v_alignbit_b32 on GFX8 and with v_perm_b32 using selector 0x7060302 on GFX9
;    and later.
21515define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
21516; GCN-LABEL: v_maxnum_v4bf16:
21517; GCN:       ; %bb.0:
21518; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21519; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21520; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
21521; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21522; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
21523; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21524; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
21525; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21526; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
21527; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21528; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21529; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21530; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21531; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21532; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21533; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21534; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21535; GCN-NEXT:    v_max_f32_e32 v3, v3, v7
21536; GCN-NEXT:    v_max_f32_e32 v2, v2, v6
21537; GCN-NEXT:    v_max_f32_e32 v1, v1, v5
21538; GCN-NEXT:    v_max_f32_e32 v0, v0, v4
21539; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21540; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21541; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21542; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21543; GCN-NEXT:    s_setpc_b64 s[30:31]
21544;
21545; GFX7-LABEL: v_maxnum_v4bf16:
21546; GFX7:       ; %bb.0:
21547; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21548; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21549; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
21550; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21551; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
21552; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21553; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
21554; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21555; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
21556; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21557; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21558; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21559; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21560; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21561; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21562; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21563; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21564; GFX7-NEXT:    v_max_f32_e32 v3, v3, v7
21565; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
21566; GFX7-NEXT:    v_max_f32_e32 v1, v1, v5
21567; GFX7-NEXT:    v_max_f32_e32 v0, v0, v4
21568; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21569; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21570; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21571; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21572; GFX7-NEXT:    s_setpc_b64 s[30:31]
21573;
21574; GFX8-LABEL: v_maxnum_v4bf16:
21575; GFX8:       ; %bb.0:
21576; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21577; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
21578; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
21579; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
21580; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
21581; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
21582; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21583; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21584; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
21585; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
21586; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
21587; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
21588; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
21589; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
21590; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
21591; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
21592; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
21593; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
21594; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
21595; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
21596; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
21597; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
21598; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
21599; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
21600; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
21601; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21602; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21603; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
21604; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
21605; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
21606; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
21607; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
21608; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
21609; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
21610; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
21611; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
21612; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21613; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
21614; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
21615; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21616; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
21617; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
21618; GFX8-NEXT:    s_setpc_b64 s[30:31]
21619;
21620; GFX9-LABEL: v_maxnum_v4bf16:
21621; GFX9:       ; %bb.0:
21622; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21623; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
21624; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
21625; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
21626; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21627; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21628; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
21629; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
21630; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
21631; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
21632; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
21633; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
21634; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
21635; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
21636; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
21637; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
21638; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
21639; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
21640; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
21641; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
21642; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
21643; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21644; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21645; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
21646; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
21647; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
21648; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
21649; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
21650; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
21651; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
21652; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
21653; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
21654; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21655; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
21656; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
21657; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
21658; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
21659; GFX9-NEXT:    s_setpc_b64 s[30:31]
21660;
21661; GFX10-LABEL: v_maxnum_v4bf16:
21662; GFX10:       ; %bb.0:
21663; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21664; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
21665; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
21666; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21667; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21668; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
21669; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
21670; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
21671; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21672; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21673; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
21674; GFX10-NEXT:    v_max_f32_e32 v3, v7, v6
21675; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
21676; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
21677; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
21678; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
21679; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
21680; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
21681; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
21682; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
21683; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
21684; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
21685; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
21686; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
21687; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
21688; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
21689; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
21690; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
21691; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
21692; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21693; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
21694; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
21695; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
21696; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
21697; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
21698; GFX10-NEXT:    s_setpc_b64 s[30:31]
21699;
21700; GFX11-LABEL: v_maxnum_v4bf16:
21701; GFX11:       ; %bb.0:
21702; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21703; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
21704; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
21705; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21706; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21707; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
21708; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
21709; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21710; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
21711; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21712; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
21713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21714; GFX11-NEXT:    v_max_f32_e32 v1, v1, v3
21715; GFX11-NEXT:    v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
21716; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
21717; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
21718; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
21719; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
21720; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
21721; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
21722; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
21723; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
21724; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
21725; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
21726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
21727; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
21728; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
21729; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
21730; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
21731; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
21732; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
21733; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
21734; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21735; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
21736; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
21737; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
21738; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc_lo
21739; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
21740; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
21741; GFX11-NEXT:    s_setpc_b64 s[30:31]
21742  %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
21743  ret <4 x bfloat> %op
21744}
21745
21746define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
21747; GCN-LABEL: v_maxnum_v8bf16:
21748; GCN:       ; %bb.0:
21749; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21750; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21751; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
21752; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21753; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
21754; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21755; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
21756; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21757; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
21758; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
21759; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
21760; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
21761; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
21762; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
21763; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
21764; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
21765; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
21766; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
21767; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21768; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
21769; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21770; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
21771; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21772; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
21773; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21774; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
21775; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21776; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
21777; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21778; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
21779; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21780; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
21781; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21782; GCN-NEXT:    v_max_f32_e32 v7, v7, v15
21783; GCN-NEXT:    v_max_f32_e32 v6, v6, v14
21784; GCN-NEXT:    v_max_f32_e32 v5, v5, v13
21785; GCN-NEXT:    v_max_f32_e32 v4, v4, v12
21786; GCN-NEXT:    v_max_f32_e32 v3, v3, v11
21787; GCN-NEXT:    v_max_f32_e32 v2, v2, v10
21788; GCN-NEXT:    v_max_f32_e32 v1, v1, v9
21789; GCN-NEXT:    v_max_f32_e32 v0, v0, v8
21790; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21791; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21792; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21793; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21794; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21795; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21796; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21797; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21798; GCN-NEXT:    s_setpc_b64 s[30:31]
21799;
21800; GFX7-LABEL: v_maxnum_v8bf16:
21801; GFX7:       ; %bb.0:
21802; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21803; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
21804; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
21805; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
21806; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
21807; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
21808; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
21809; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
21810; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
21811; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
21812; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
21813; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
21814; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
21815; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
21816; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
21817; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
21818; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
21819; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
21820; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21821; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
21822; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21823; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
21824; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21825; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
21826; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21827; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
21828; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21829; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
21830; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21831; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
21832; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21833; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
21834; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21835; GFX7-NEXT:    v_max_f32_e32 v7, v7, v15
21836; GFX7-NEXT:    v_max_f32_e32 v6, v6, v14
21837; GFX7-NEXT:    v_max_f32_e32 v5, v5, v13
21838; GFX7-NEXT:    v_max_f32_e32 v4, v4, v12
21839; GFX7-NEXT:    v_max_f32_e32 v3, v3, v11
21840; GFX7-NEXT:    v_max_f32_e32 v2, v2, v10
21841; GFX7-NEXT:    v_max_f32_e32 v1, v1, v9
21842; GFX7-NEXT:    v_max_f32_e32 v0, v0, v8
21843; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21844; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21845; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21846; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21847; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21848; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21849; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21850; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21851; GFX7-NEXT:    s_setpc_b64 s[30:31]
21852;
21853; GFX8-LABEL: v_maxnum_v8bf16:
21854; GFX8:       ; %bb.0:
21855; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21856; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
21857; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
21858; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
21859; GFX8-NEXT:    v_bfe_u32 v9, v8, 16, 1
21860; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v8
21861; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21862; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21863; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
21864; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
21865; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v8
21866; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
21867; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
21868; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
21869; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
21870; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
21871; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
21872; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
21873; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
21874; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
21875; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
21876; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
21877; GFX8-NEXT:    v_max_f32_e32 v7, v9, v7
21878; GFX8-NEXT:    v_bfe_u32 v9, v7, 16, 1
21879; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
21880; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21881; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21882; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
21883; GFX8-NEXT:    v_max_f32_e32 v2, v2, v6
21884; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v7
21885; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
21886; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
21887; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
21888; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
21889; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
21890; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
21891; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
21892; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
21893; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
21894; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
21895; GFX8-NEXT:    v_max_f32_e32 v6, v9, v6
21896; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
21897; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
21898; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21899; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21900; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
21901; GFX8-NEXT:    v_max_f32_e32 v1, v1, v5
21902; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
21903; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
21904; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
21905; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
21906; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
21907; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
21908; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
21909; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
21910; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
21911; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
21912; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
21913; GFX8-NEXT:    v_max_f32_e32 v5, v9, v5
21914; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
21915; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
21916; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21917; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21918; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
21919; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
21920; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
21921; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
21922; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
21923; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
21924; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
21925; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
21926; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v0
21927; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
21928; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
21929; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
21930; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
21931; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
21932; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
21933; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
21934; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
21935; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
21936; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
21937; GFX8-NEXT:    s_setpc_b64 s[30:31]
21938;
21939; GFX9-LABEL: v_maxnum_v8bf16:
21940; GFX9:       ; %bb.0:
21941; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21942; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
21943; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
21944; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
21945; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
21946; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
21947; GFX9-NEXT:    v_bfe_u32 v9, v8, 16, 1
21948; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
21949; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
21950; GFX9-NEXT:    v_add3_u32 v9, v9, v8, s4
21951; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v8
21952; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
21953; GFX9-NEXT:    v_bfe_u32 v7, v3, 16, 1
21954; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v10, vcc
21955; GFX9-NEXT:    v_add3_u32 v7, v7, v3, s4
21956; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v3
21957; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
21958; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
21959; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
21960; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
21961; GFX9-NEXT:    v_max_f32_e32 v7, v9, v7
21962; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
21963; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
21964; GFX9-NEXT:    v_bfe_u32 v9, v7, 16, 1
21965; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
21966; GFX9-NEXT:    v_add3_u32 v9, v9, v7, s4
21967; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v7
21968; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
21969; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
21970; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v10, vcc
21971; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s4
21972; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v2
21973; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
21974; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
21975; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
21976; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
21977; GFX9-NEXT:    v_max_f32_e32 v6, v9, v6
21978; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
21979; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
21980; GFX9-NEXT:    v_bfe_u32 v9, v6, 16, 1
21981; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
21982; GFX9-NEXT:    v_add3_u32 v9, v9, v6, s4
21983; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v6
21984; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
21985; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
21986; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
21987; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s4
21988; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v1
21989; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
21990; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
21991; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
21992; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
21993; GFX9-NEXT:    v_max_f32_e32 v5, v9, v5
21994; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
21995; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
21996; GFX9-NEXT:    v_bfe_u32 v9, v5, 16, 1
21997; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
21998; GFX9-NEXT:    v_add3_u32 v9, v9, v5, s4
21999; GFX9-NEXT:    v_or_b32_e32 v10, 0x400000, v5
22000; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
22001; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
22002; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
22003; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
22004; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v0
22005; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
22006; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
22007; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
22008; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
22009; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
22010; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
22011; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
22012; GFX9-NEXT:    s_setpc_b64 s[30:31]
22013;
22014; GFX10-LABEL: v_maxnum_v8bf16:
22015; GFX10:       ; %bb.0:
22016; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22017; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
22018; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
22019; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22020; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22021; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
22022; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22023; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
22024; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
22025; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22026; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
22027; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
22028; GFX10-NEXT:    v_bfe_u32 v11, v8, 16, 1
22029; GFX10-NEXT:    v_max_f32_e32 v7, v10, v9
22030; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v8
22031; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
22032; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
22033; GFX10-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
22034; GFX10-NEXT:    v_bfe_u32 v11, v3, 16, 1
22035; GFX10-NEXT:    v_bfe_u32 v12, v7, 16, 1
22036; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
22037; GFX10-NEXT:    v_bfe_u32 v13, v2, 16, 1
22038; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
22039; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
22040; GFX10-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
22041; GFX10-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
22042; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
22043; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
22044; GFX10-NEXT:    v_max_f32_e32 v6, v10, v6
22045; GFX10-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
22046; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22047; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22048; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
22049; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
22050; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v2
22051; GFX10-NEXT:    v_bfe_u32 v12, v6, 16, 1
22052; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22053; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22054; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
22055; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
22056; GFX10-NEXT:    v_max_f32_e32 v5, v15, v13
22057; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v3
22058; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
22059; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
22060; GFX10-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
22061; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
22062; GFX10-NEXT:    v_bfe_u32 v11, v1, 16, 1
22063; GFX10-NEXT:    v_bfe_u32 v12, v5, 16, 1
22064; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
22065; GFX10-NEXT:    v_bfe_u32 v13, v0, 16, 1
22066; GFX10-NEXT:    v_or_b32_e32 v15, 0x400000, v1
22067; GFX10-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
22068; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v5
22069; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
22070; GFX10-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
22071; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
22072; GFX10-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
22073; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v0
22074; GFX10-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
22075; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
22076; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
22077; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
22078; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
22079; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
22080; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
22081; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
22082; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
22083; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
22084; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
22085; GFX10-NEXT:    s_setpc_b64 s[30:31]
22086;
22087; GFX11-LABEL: v_maxnum_v8bf16:
22088; GFX11:       ; %bb.0:
22089; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22090; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
22091; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
22092; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22093; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
22094; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
22095; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
22096; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
22097; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
22098; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
22099; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22100; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
22101; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22102; GFX11-NEXT:    v_max_f32_e32 v3, v3, v7
22103; GFX11-NEXT:    v_max_f32_e32 v7, v10, v9
22104; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v8
22105; GFX11-NEXT:    v_add3_u32 v10, v11, v8, 0x7fff
22106; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22107; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
22108; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
22109; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
22110; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
22111; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc_lo
22112; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
22113; GFX11-NEXT:    v_add3_u32 v9, v11, v3, 0x7fff
22114; GFX11-NEXT:    v_add3_u32 v11, v12, v7, 0x7fff
22115; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
22116; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22117; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
22118; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22119; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
22120; GFX11-NEXT:    v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
22121; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
22122; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22123; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
22124; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22125; GFX11-NEXT:    v_max_f32_e32 v6, v10, v6
22126; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v2
22127; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
22128; GFX11-NEXT:    v_add3_u32 v10, v13, v2, 0x7fff
22129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
22130; GFX11-NEXT:    v_bfe_u32 v12, v6, 16, 1
22131; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
22132; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
22133; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
22134; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
22135; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22136; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22137; GFX11-NEXT:    v_perm_b32 v2, v2, v7, 0x7060302
22138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
22139; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
22140; GFX11-NEXT:    v_add3_u32 v4, v12, v6, 0x7fff
22141; GFX11-NEXT:    v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
22142; GFX11-NEXT:    v_max_f32_e32 v5, v15, v13
22143; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22144; GFX11-NEXT:    v_bfe_u32 v11, v1, 16, 1
22145; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
22146; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
22147; GFX11-NEXT:    v_bfe_u32 v12, v5, 16, 1
22148; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22149; GFX11-NEXT:    v_add3_u32 v6, v11, v1, 0x7fff
22150; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
22151; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
22152; GFX11-NEXT:    v_add3_u32 v10, v12, v5, 0x7fff
22153; GFX11-NEXT:    v_add3_u32 v12, v13, v0, 0x7fff
22154; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
22155; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
22156; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
22157; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
22158; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc_lo
22159; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
22160; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22161; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
22162; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v15, vcc_lo
22163; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
22164; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
22165; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v14, vcc_lo
22166; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
22167; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
22168; GFX11-NEXT:    s_setpc_b64 s[30:31]
22169  %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
22170  ret <8 x bfloat> %op
22171}
22172
22173define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
22174; GCN-LABEL: v_maxnum_v16bf16:
22175; GCN:       ; %bb.0:
22176; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22177; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
22178; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
22179; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
22180; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22181; GCN-NEXT:    v_max_f32_e32 v14, v14, v30
22182; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
22183; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
22184; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
22185; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22186; GCN-NEXT:    v_max_f32_e32 v13, v13, v29
22187; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
22188; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
22189; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
22190; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22191; GCN-NEXT:    v_max_f32_e32 v12, v12, v28
22192; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
22193; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
22194; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
22195; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
22196; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
22197; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
22198; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
22199; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
22200; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
22201; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
22202; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
22203; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
22204; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
22205; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
22206; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
22207; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
22208; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
22209; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
22210; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
22211; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
22212; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
22213; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
22214; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
22215; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
22216; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
22217; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
22218; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22219; GCN-NEXT:    v_max_f32_e32 v11, v11, v27
22220; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32
22221; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
22222; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22223; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
22224; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22225; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
22226; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22227; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
22228; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22229; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
22230; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22231; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
22232; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22233; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
22234; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22235; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
22236; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22237; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
22238; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22239; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
22240; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22241; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
22242; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22243; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22244; GCN-NEXT:    v_max_f32_e32 v10, v10, v26
22245; GCN-NEXT:    v_max_f32_e32 v9, v9, v25
22246; GCN-NEXT:    v_max_f32_e32 v8, v8, v24
22247; GCN-NEXT:    v_max_f32_e32 v7, v7, v23
22248; GCN-NEXT:    v_max_f32_e32 v6, v6, v22
22249; GCN-NEXT:    v_max_f32_e32 v5, v5, v21
22250; GCN-NEXT:    v_max_f32_e32 v4, v4, v20
22251; GCN-NEXT:    v_max_f32_e32 v3, v3, v19
22252; GCN-NEXT:    v_max_f32_e32 v2, v2, v18
22253; GCN-NEXT:    v_max_f32_e32 v1, v1, v17
22254; GCN-NEXT:    v_max_f32_e32 v0, v0, v16
22255; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22256; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22257; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22258; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22259; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22260; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22261; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22262; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22263; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22264; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22265; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22266; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22267; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22268; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22269; GCN-NEXT:    s_waitcnt vmcnt(0)
22270; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v27
22271; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
22272; GCN-NEXT:    v_max_f32_e32 v15, v15, v16
22273; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22274; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22275; GCN-NEXT:    s_setpc_b64 s[30:31]
22276;
22277; GFX7-LABEL: v_maxnum_v16bf16:
22278; GFX7:       ; %bb.0:
22279; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22280; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
22281; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
22282; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
22283; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22284; GFX7-NEXT:    v_max_f32_e32 v11, v11, v27
22285; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32
22286; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
22287; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
22288; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
22289; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22290; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
22291; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
22292; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
22293; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
22294; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
22295; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
22296; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
22297; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
22298; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
22299; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
22300; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
22301; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
22302; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
22303; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
22304; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
22305; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
22306; GFX7-NEXT:    v_max_f32_e32 v6, v6, v22
22307; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
22308; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
22309; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
22310; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
22311; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
22312; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
22313; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
22314; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
22315; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
22316; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
22317; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
22318; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
22319; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22320; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
22321; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22322; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
22323; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22324; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
22325; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22326; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
22327; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22328; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
22329; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22330; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
22331; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22332; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22333; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
22334; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22335; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
22336; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22337; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
22338; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22339; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
22340; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22341; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
22342; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22343; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
22344; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22345; GFX7-NEXT:    v_max_f32_e32 v14, v14, v30
22346; GFX7-NEXT:    v_max_f32_e32 v13, v13, v29
22347; GFX7-NEXT:    v_max_f32_e32 v12, v12, v28
22348; GFX7-NEXT:    v_max_f32_e32 v10, v10, v26
22349; GFX7-NEXT:    v_max_f32_e32 v9, v9, v25
22350; GFX7-NEXT:    v_max_f32_e32 v8, v8, v24
22351; GFX7-NEXT:    v_max_f32_e32 v7, v7, v23
22352; GFX7-NEXT:    v_max_f32_e32 v5, v5, v21
22353; GFX7-NEXT:    v_max_f32_e32 v4, v4, v20
22354; GFX7-NEXT:    v_max_f32_e32 v3, v3, v19
22355; GFX7-NEXT:    v_max_f32_e32 v2, v2, v18
22356; GFX7-NEXT:    v_max_f32_e32 v1, v1, v17
22357; GFX7-NEXT:    v_max_f32_e32 v0, v0, v16
22358; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22359; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22360; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22361; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22362; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22363; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22364; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22365; GFX7-NEXT:    s_waitcnt vmcnt(0)
22366; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v27
22367; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
22368; GFX7-NEXT:    v_max_f32_e32 v15, v15, v22
22369; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22370; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22371; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22372; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22373; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22374; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22375; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22376; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22377; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22378; GFX7-NEXT:    s_setpc_b64 s[30:31]
22379;
22380; GFX8-LABEL: v_maxnum_v16bf16:
22381; GFX8:       ; %bb.0:
22382; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22383; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
22384; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
22385; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
22386; GFX8-NEXT:    v_bfe_u32 v17, v16, 16, 1
22387; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
22388; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
22389; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22390; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22391; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22392; GFX8-NEXT:    v_max_f32_e32 v7, v7, v15
22393; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v16
22394; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
22395; GFX8-NEXT:    v_bfe_u32 v15, v7, 16, 1
22396; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
22397; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v7
22398; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
22399; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v7
22400; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
22401; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
22402; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
22403; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
22404; GFX8-NEXT:    v_max_f32_e32 v15, v17, v15
22405; GFX8-NEXT:    v_bfe_u32 v17, v15, 16, 1
22406; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v15
22407; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22408; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22409; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22410; GFX8-NEXT:    v_max_f32_e32 v6, v6, v14
22411; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v15
22412; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
22413; GFX8-NEXT:    v_bfe_u32 v14, v6, 16, 1
22414; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
22415; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v6
22416; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s4, v14
22417; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v6
22418; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
22419; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
22420; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
22421; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
22422; GFX8-NEXT:    v_max_f32_e32 v14, v17, v14
22423; GFX8-NEXT:    v_bfe_u32 v17, v14, 16, 1
22424; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v14
22425; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22426; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22427; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22428; GFX8-NEXT:    v_max_f32_e32 v5, v5, v13
22429; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v14
22430; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
22431; GFX8-NEXT:    v_bfe_u32 v13, v5, 16, 1
22432; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
22433; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v5
22434; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s4, v13
22435; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v5
22436; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
22437; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
22438; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
22439; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
22440; GFX8-NEXT:    v_max_f32_e32 v13, v17, v13
22441; GFX8-NEXT:    v_bfe_u32 v17, v13, 16, 1
22442; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v13
22443; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22444; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22445; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22446; GFX8-NEXT:    v_max_f32_e32 v4, v4, v12
22447; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v13
22448; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
22449; GFX8-NEXT:    v_bfe_u32 v12, v4, 16, 1
22450; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
22451; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v4
22452; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v12
22453; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v4
22454; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
22455; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
22456; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
22457; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
22458; GFX8-NEXT:    v_max_f32_e32 v12, v17, v12
22459; GFX8-NEXT:    v_bfe_u32 v17, v12, 16, 1
22460; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v12
22461; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22462; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22463; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22464; GFX8-NEXT:    v_max_f32_e32 v3, v3, v11
22465; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v12
22466; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
22467; GFX8-NEXT:    v_bfe_u32 v11, v3, 16, 1
22468; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
22469; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v3
22470; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s4, v11
22471; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v3
22472; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
22473; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
22474; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
22475; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
22476; GFX8-NEXT:    v_max_f32_e32 v11, v17, v11
22477; GFX8-NEXT:    v_bfe_u32 v17, v11, 16, 1
22478; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v11
22479; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22480; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22481; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22482; GFX8-NEXT:    v_max_f32_e32 v2, v2, v10
22483; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v11
22484; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
22485; GFX8-NEXT:    v_bfe_u32 v10, v2, 16, 1
22486; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
22487; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v2
22488; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s4, v10
22489; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v2
22490; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
22491; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
22492; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
22493; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
22494; GFX8-NEXT:    v_max_f32_e32 v10, v17, v10
22495; GFX8-NEXT:    v_bfe_u32 v17, v10, 16, 1
22496; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v10
22497; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22498; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22499; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22500; GFX8-NEXT:    v_max_f32_e32 v1, v1, v9
22501; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v10
22502; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
22503; GFX8-NEXT:    v_bfe_u32 v9, v1, 16, 1
22504; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
22505; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v1
22506; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
22507; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v1
22508; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
22509; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
22510; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
22511; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
22512; GFX8-NEXT:    v_max_f32_e32 v9, v17, v9
22513; GFX8-NEXT:    v_bfe_u32 v17, v9, 16, 1
22514; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v9
22515; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22516; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22517; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
22518; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
22519; GFX8-NEXT:    v_or_b32_e32 v18, 0x400000, v9
22520; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
22521; GFX8-NEXT:    v_bfe_u32 v8, v0, 16, 1
22522; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
22523; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v0
22524; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v8
22525; GFX8-NEXT:    v_or_b32_e32 v17, 0x400000, v0
22526; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
22527; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
22528; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
22529; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
22530; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
22531; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
22532; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
22533; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
22534; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
22535; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
22536; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
22537; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
22538; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
22539; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
22540; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
22541; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
22542; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
22543; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
22544; GFX8-NEXT:    s_setpc_b64 s[30:31]
22545;
22546; GFX9-LABEL: v_maxnum_v16bf16:
22547; GFX9:       ; %bb.0:
22548; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22549; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
22550; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
22551; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
22552; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22553; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22554; GFX9-NEXT:    v_bfe_u32 v17, v16, 16, 1
22555; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
22556; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
22557; GFX9-NEXT:    v_add3_u32 v17, v17, v16, s4
22558; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v16
22559; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
22560; GFX9-NEXT:    v_bfe_u32 v15, v7, 16, 1
22561; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
22562; GFX9-NEXT:    v_add3_u32 v15, v15, v7, s4
22563; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v7
22564; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
22565; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v17, vcc
22566; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
22567; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
22568; GFX9-NEXT:    v_max_f32_e32 v15, v17, v15
22569; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22570; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22571; GFX9-NEXT:    v_bfe_u32 v17, v15, 16, 1
22572; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
22573; GFX9-NEXT:    v_add3_u32 v17, v17, v15, s4
22574; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
22575; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
22576; GFX9-NEXT:    v_bfe_u32 v14, v6, 16, 1
22577; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
22578; GFX9-NEXT:    v_add3_u32 v14, v14, v6, s4
22579; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v6
22580; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
22581; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc
22582; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
22583; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
22584; GFX9-NEXT:    v_max_f32_e32 v14, v17, v14
22585; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22586; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22587; GFX9-NEXT:    v_bfe_u32 v17, v14, 16, 1
22588; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
22589; GFX9-NEXT:    v_add3_u32 v17, v17, v14, s4
22590; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v14
22591; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
22592; GFX9-NEXT:    v_bfe_u32 v13, v5, 16, 1
22593; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v18, vcc
22594; GFX9-NEXT:    v_add3_u32 v13, v13, v5, s4
22595; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v5
22596; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
22597; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v17, vcc
22598; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
22599; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
22600; GFX9-NEXT:    v_max_f32_e32 v13, v17, v13
22601; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22602; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22603; GFX9-NEXT:    v_bfe_u32 v17, v13, 16, 1
22604; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
22605; GFX9-NEXT:    v_add3_u32 v17, v17, v13, s4
22606; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v13
22607; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
22608; GFX9-NEXT:    v_bfe_u32 v12, v4, 16, 1
22609; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v18, vcc
22610; GFX9-NEXT:    v_add3_u32 v12, v12, v4, s4
22611; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v4
22612; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
22613; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v17, vcc
22614; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
22615; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
22616; GFX9-NEXT:    v_max_f32_e32 v12, v17, v12
22617; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22618; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22619; GFX9-NEXT:    v_bfe_u32 v17, v12, 16, 1
22620; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
22621; GFX9-NEXT:    v_add3_u32 v17, v17, v12, s4
22622; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v12
22623; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
22624; GFX9-NEXT:    v_bfe_u32 v11, v3, 16, 1
22625; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v18, vcc
22626; GFX9-NEXT:    v_add3_u32 v11, v11, v3, s4
22627; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v3
22628; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
22629; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v17, vcc
22630; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
22631; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
22632; GFX9-NEXT:    v_max_f32_e32 v11, v17, v11
22633; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22634; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22635; GFX9-NEXT:    v_bfe_u32 v17, v11, 16, 1
22636; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
22637; GFX9-NEXT:    v_add3_u32 v17, v17, v11, s4
22638; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v11
22639; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
22640; GFX9-NEXT:    v_bfe_u32 v10, v2, 16, 1
22641; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v18, vcc
22642; GFX9-NEXT:    v_add3_u32 v10, v10, v2, s4
22643; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v2
22644; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
22645; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v17, vcc
22646; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
22647; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
22648; GFX9-NEXT:    v_max_f32_e32 v10, v17, v10
22649; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22650; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22651; GFX9-NEXT:    v_bfe_u32 v17, v10, 16, 1
22652; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
22653; GFX9-NEXT:    v_add3_u32 v17, v17, v10, s4
22654; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v10
22655; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
22656; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
22657; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v18, vcc
22658; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s4
22659; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
22660; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
22661; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v17, vcc
22662; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
22663; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
22664; GFX9-NEXT:    v_max_f32_e32 v9, v17, v9
22665; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22666; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22667; GFX9-NEXT:    v_bfe_u32 v17, v9, 16, 1
22668; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
22669; GFX9-NEXT:    v_add3_u32 v17, v17, v9, s4
22670; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v9
22671; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
22672; GFX9-NEXT:    v_bfe_u32 v8, v0, 16, 1
22673; GFX9-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
22674; GFX9-NEXT:    v_add3_u32 v8, v8, v0, s4
22675; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v0
22676; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
22677; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v17, vcc
22678; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
22679; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
22680; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
22681; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
22682; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
22683; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
22684; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
22685; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
22686; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
22687; GFX9-NEXT:    s_setpc_b64 s[30:31]
22688;
22689; GFX10-LABEL: v_maxnum_v16bf16:
22690; GFX10:       ; %bb.0:
22691; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22692; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
22693; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
22694; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
22695; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22696; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
22697; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22698; GFX10-NEXT:    v_max_f32_e32 v16, v17, v16
22699; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
22700; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
22701; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22702; GFX10-NEXT:    v_bfe_u32 v15, v16, 16, 1
22703; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v16
22704; GFX10-NEXT:    v_bfe_u32 v19, v7, 16, 1
22705; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
22706; GFX10-NEXT:    v_max_f32_e32 v17, v18, v17
22707; GFX10-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
22708; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
22709; GFX10-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
22710; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v7
22711; GFX10-NEXT:    v_bfe_u32 v21, v17, 16, 1
22712; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v20, vcc_lo
22713; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
22714; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
22715; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v17
22716; GFX10-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
22717; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22718; GFX10-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
22719; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
22720; GFX10-NEXT:    v_bfe_u32 v18, v6, 16, 1
22721; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22722; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
22723; GFX10-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
22724; GFX10-NEXT:    v_max_f32_e32 v17, v20, v19
22725; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
22726; GFX10-NEXT:    v_max_f32_e32 v5, v5, v13
22727; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc_lo
22728; GFX10-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
22729; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v6
22730; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
22731; GFX10-NEXT:    v_bfe_u32 v20, v17, 16, 1
22732; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
22733; GFX10-NEXT:    v_bfe_u32 v21, v5, 16, 1
22734; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22735; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22736; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v13, vcc_lo
22737; GFX10-NEXT:    v_max_f32_e32 v13, v19, v18
22738; GFX10-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
22739; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v17
22740; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
22741; GFX10-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
22742; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v5
22743; GFX10-NEXT:    v_bfe_u32 v21, v13, 16, 1
22744; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
22745; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
22746; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
22747; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
22748; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
22749; GFX10-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
22750; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22751; GFX10-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
22752; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v13
22753; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22754; GFX10-NEXT:    v_max_f32_e32 v12, v18, v12
22755; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
22756; GFX10-NEXT:    v_bfe_u32 v20, v4, 16, 1
22757; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
22758; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
22759; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v12
22760; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
22761; GFX10-NEXT:    v_bfe_u32 v17, v12, 16, 1
22762; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
22763; GFX10-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
22764; GFX10-NEXT:    v_bfe_u32 v20, v3, 16, 1
22765; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22766; GFX10-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
22767; GFX10-NEXT:    v_max_f32_e32 v18, v19, v18
22768; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22769; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
22770; GFX10-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
22771; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v3
22772; GFX10-NEXT:    v_bfe_u32 v23, v18, 16, 1
22773; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
22774; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
22775; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
22776; GFX10-NEXT:    v_or_b32_e32 v17, 0x400000, v18
22777; GFX10-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
22778; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
22779; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
22780; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
22781; GFX10-NEXT:    v_bfe_u32 v19, v2, 16, 1
22782; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
22783; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
22784; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22785; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v2
22786; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v4
22787; GFX10-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
22788; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
22789; GFX10-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
22790; GFX10-NEXT:    v_max_f32_e32 v19, v22, v20
22791; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
22792; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
22793; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22794; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22795; GFX10-NEXT:    v_bfe_u32 v23, v19, 16, 1
22796; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
22797; GFX10-NEXT:    v_max_f32_e32 v9, v22, v20
22798; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v19
22799; GFX10-NEXT:    v_max_f32_e32 v0, v0, v8
22800; GFX10-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
22801; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
22802; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
22803; GFX10-NEXT:    v_bfe_u32 v23, v9, 16, 1
22804; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v9
22805; GFX10-NEXT:    v_or_b32_e32 v25, 0x400000, v0
22806; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
22807; GFX10-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
22808; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v1
22809; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
22810; GFX10-NEXT:    v_bfe_u32 v20, v0, 16, 1
22811; GFX10-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
22812; GFX10-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
22813; GFX10-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
22814; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
22815; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
22816; GFX10-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
22817; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
22818; GFX10-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
22819; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
22820; GFX10-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
22821; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
22822; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
22823; GFX10-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
22824; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
22825; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
22826; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
22827; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
22828; GFX10-NEXT:    s_setpc_b64 s[30:31]
22829;
22830; GFX11-LABEL: v_maxnum_v16bf16:
22831; GFX11:       ; %bb.0:
22832; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22833; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
22834; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
22835; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
22836; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
22837; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
22838; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
22839; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
22840; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
22841; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v16
22842; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
22843; GFX11-NEXT:    v_max_f32_e32 v17, v18, v17
22844; GFX11-NEXT:    v_max_f32_e32 v6, v6, v14
22845; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
22846; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
22847; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
22848; GFX11-NEXT:    v_add3_u32 v14, v21, v17, 0x7fff
22849; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
22850; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
22851; GFX11-NEXT:    v_max_f32_e32 v7, v7, v15
22852; GFX11-NEXT:    v_bfe_u32 v15, v16, 16, 1
22853; GFX11-NEXT:    v_add3_u32 v15, v15, v16, 0x7fff
22854; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v17
22855; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22856; GFX11-NEXT:    v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
22857; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
22858; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
22859; GFX11-NEXT:    v_add3_u32 v18, v19, v7, 0x7fff
22860; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v7
22861; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
22862; GFX11-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc_lo
22863; GFX11-NEXT:    v_bfe_u32 v18, v6, 16, 1
22864; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v13
22865; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
22866; GFX11-NEXT:    v_perm_b32 v7, v7, v15, 0x7060302
22867; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
22868; GFX11-NEXT:    v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
22869; GFX11-NEXT:    v_add3_u32 v16, v18, v6, 0x7fff
22870; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
22871; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v4
22872; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
22873; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
22874; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
22875; GFX11-NEXT:    v_bfe_u32 v20, v17, 16, 1
22876; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
22877; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
22878; GFX11-NEXT:    v_max_f32_e32 v4, v4, v12
22879; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
22880; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
22881; GFX11-NEXT:    v_max_f32_e32 v5, v5, v13
22882; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v6
22883; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
22884; GFX11-NEXT:    v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
22885; GFX11-NEXT:    v_add3_u32 v16, v20, v17, 0x7fff
22886; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
22887; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
22888; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x7060302
22889; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
22890; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc_lo
22891; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
22892; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
22893; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
22894; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
22895; GFX11-NEXT:    v_max_f32_e32 v12, v18, v12
22896; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22897; GFX11-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
22898; GFX11-NEXT:    v_bfe_u32 v21, v13, 16, 1
22899; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
22900; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
22901; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22902; GFX11-NEXT:    v_cndmask_b32_e32 v5, v19, v20, vcc_lo
22903; GFX11-NEXT:    v_add3_u32 v17, v21, v13, 0x7fff
22904; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v13
22905; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
22906; GFX11-NEXT:    v_bfe_u32 v20, v4, 16, 1
22907; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v4
22908; GFX11-NEXT:    v_perm_b32 v5, v5, v16, 0x7060302
22909; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v19, vcc_lo
22910; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
22911; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
22912; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
22913; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
22914; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
22915; GFX11-NEXT:    v_add3_u32 v17, v17, v12, 0x7fff
22916; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
22917; GFX11-NEXT:    v_max_f32_e32 v18, v19, v18
22918; GFX11-NEXT:    v_cndmask_b32_e32 v12, v17, v22, vcc_lo
22919; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
22920; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
22921; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
22922; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
22923; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
22924; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22925; GFX11-NEXT:    v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
22926; GFX11-NEXT:    v_max_f32_e32 v3, v3, v11
22927; GFX11-NEXT:    v_add3_u32 v11, v20, v4, 0x7fff
22928; GFX11-NEXT:    v_add3_u32 v10, v23, v18, 0x7fff
22929; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
22930; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
22931; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
22932; GFX11-NEXT:    v_add3_u32 v19, v20, v3, 0x7fff
22933; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
22934; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
22935; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
22936; GFX11-NEXT:    v_bfe_u32 v19, v2, 16, 1
22937; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v9
22938; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
22939; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v2
22940; GFX11-NEXT:    v_perm_b32 v3, v3, v12, 0x7060302
22941; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v17, vcc_lo
22942; GFX11-NEXT:    v_add3_u32 v17, v19, v2, 0x7fff
22943; GFX11-NEXT:    v_max_f32_e32 v19, v22, v20
22944; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v8
22945; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
22946; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
22947; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
22948; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
22949; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
22950; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
22951; GFX11-NEXT:    v_bfe_u32 v23, v19, 16, 1
22952; GFX11-NEXT:    v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9
22953; GFX11-NEXT:    v_max_f32_e32 v9, v22, v20
22954; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
22955; GFX11-NEXT:    v_add3_u32 v20, v23, v19, 0x7fff
22956; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
22957; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v0
22958; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
22959; GFX11-NEXT:    v_bfe_u32 v23, v9, 16, 1
22960; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v9
22961; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v22, vcc_lo
22962; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v1
22963; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
22964; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
22965; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
22966; GFX11-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
22967; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
22968; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v22, vcc_lo
22969; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
22970; GFX11-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
22971; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22972; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
22973; GFX11-NEXT:    v_cndmask_b32_e32 v8, v23, v24, vcc_lo
22974; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
22975; GFX11-NEXT:    v_cndmask_b32_e32 v0, v20, v25, vcc_lo
22976; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
22977; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22978; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
22979; GFX11-NEXT:    v_cndmask_b32_e32 v2, v17, v18, vcc_lo
22980; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
22981; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
22982; GFX11-NEXT:    v_cndmask_b32_e32 v4, v11, v21, vcc_lo
22983; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
22984; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
22985; GFX11-NEXT:    s_setpc_b64 s[30:31]
22986  %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
22987  ret <16 x bfloat> %op
22988}
22989
22990define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
22991; GCN-LABEL: v_maxnum_v32bf16:
22992; GCN:       ; %bb.0:
22993; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22994; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
22995; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
22996; GCN-NEXT:    s_waitcnt vmcnt(1)
22997; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
22998; GCN-NEXT:    s_waitcnt vmcnt(0)
22999; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23000; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23001; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
23002; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
23003; GCN-NEXT:    v_max_f32_e32 v31, v31, v32
23004; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
23005; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23006; GCN-NEXT:    s_waitcnt vmcnt(0)
23007; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23008; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23009; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
23010; GCN-NEXT:    v_max_f32_e32 v30, v30, v32
23011; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
23012; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23013; GCN-NEXT:    s_waitcnt vmcnt(0)
23014; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23015; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23016; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
23017; GCN-NEXT:    v_max_f32_e32 v29, v29, v32
23018; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
23019; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
23020; GCN-NEXT:    s_waitcnt vmcnt(0)
23021; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23022; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23023; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
23024; GCN-NEXT:    v_max_f32_e32 v28, v28, v32
23025; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
23026; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
23027; GCN-NEXT:    s_waitcnt vmcnt(0)
23028; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23029; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23030; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
23031; GCN-NEXT:    v_max_f32_e32 v27, v27, v32
23032; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
23033; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
23034; GCN-NEXT:    s_waitcnt vmcnt(0)
23035; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23036; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23037; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
23038; GCN-NEXT:    v_max_f32_e32 v26, v26, v32
23039; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
23040; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
23041; GCN-NEXT:    s_waitcnt vmcnt(0)
23042; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23043; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23044; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
23045; GCN-NEXT:    v_max_f32_e32 v25, v25, v32
23046; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
23047; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
23048; GCN-NEXT:    s_waitcnt vmcnt(0)
23049; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23050; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23051; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
23052; GCN-NEXT:    v_max_f32_e32 v24, v24, v32
23053; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
23054; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
23055; GCN-NEXT:    s_waitcnt vmcnt(0)
23056; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23057; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23058; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
23059; GCN-NEXT:    v_max_f32_e32 v23, v23, v32
23060; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
23061; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
23062; GCN-NEXT:    s_waitcnt vmcnt(0)
23063; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23064; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23065; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
23066; GCN-NEXT:    v_max_f32_e32 v22, v22, v32
23067; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
23068; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
23069; GCN-NEXT:    s_waitcnt vmcnt(0)
23070; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23071; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23072; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
23073; GCN-NEXT:    v_max_f32_e32 v21, v21, v32
23074; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
23075; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
23076; GCN-NEXT:    s_waitcnt vmcnt(0)
23077; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23078; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23079; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
23080; GCN-NEXT:    v_max_f32_e32 v20, v20, v32
23081; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
23082; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
23083; GCN-NEXT:    s_waitcnt vmcnt(0)
23084; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23085; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23086; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
23087; GCN-NEXT:    v_max_f32_e32 v19, v19, v32
23088; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
23089; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
23090; GCN-NEXT:    s_waitcnt vmcnt(0)
23091; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23092; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23093; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
23094; GCN-NEXT:    v_max_f32_e32 v18, v18, v32
23095; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
23096; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
23097; GCN-NEXT:    s_waitcnt vmcnt(0)
23098; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23099; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23100; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
23101; GCN-NEXT:    v_max_f32_e32 v17, v17, v32
23102; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
23103; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
23104; GCN-NEXT:    s_waitcnt vmcnt(0)
23105; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23106; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23107; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
23108; GCN-NEXT:    v_max_f32_e32 v16, v16, v32
23109; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
23110; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
23111; GCN-NEXT:    s_waitcnt vmcnt(0)
23112; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23113; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23114; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
23115; GCN-NEXT:    v_max_f32_e32 v15, v15, v32
23116; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
23117; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
23118; GCN-NEXT:    s_waitcnt vmcnt(0)
23119; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23120; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23121; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
23122; GCN-NEXT:    v_max_f32_e32 v14, v14, v32
23123; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
23124; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
23125; GCN-NEXT:    s_waitcnt vmcnt(0)
23126; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23127; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23128; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
23129; GCN-NEXT:    v_max_f32_e32 v13, v13, v32
23130; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
23131; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
23132; GCN-NEXT:    s_waitcnt vmcnt(0)
23133; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23134; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23135; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
23136; GCN-NEXT:    v_max_f32_e32 v12, v12, v32
23137; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
23138; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
23139; GCN-NEXT:    s_waitcnt vmcnt(0)
23140; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23141; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23142; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
23143; GCN-NEXT:    v_max_f32_e32 v11, v11, v32
23144; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
23145; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
23146; GCN-NEXT:    s_waitcnt vmcnt(0)
23147; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23148; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23149; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
23150; GCN-NEXT:    v_max_f32_e32 v10, v10, v32
23151; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
23152; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
23153; GCN-NEXT:    s_waitcnt vmcnt(0)
23154; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23155; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23156; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
23157; GCN-NEXT:    v_max_f32_e32 v9, v9, v32
23158; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
23159; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
23160; GCN-NEXT:    s_waitcnt vmcnt(0)
23161; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23162; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23163; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
23164; GCN-NEXT:    v_max_f32_e32 v8, v8, v32
23165; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
23166; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
23167; GCN-NEXT:    s_waitcnt vmcnt(0)
23168; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23169; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23170; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
23171; GCN-NEXT:    v_max_f32_e32 v7, v7, v32
23172; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
23173; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
23174; GCN-NEXT:    s_waitcnt vmcnt(0)
23175; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23176; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23177; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
23178; GCN-NEXT:    v_max_f32_e32 v6, v6, v32
23179; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
23180; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
23181; GCN-NEXT:    s_waitcnt vmcnt(0)
23182; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23183; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23184; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
23185; GCN-NEXT:    v_max_f32_e32 v5, v5, v32
23186; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
23187; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
23188; GCN-NEXT:    s_waitcnt vmcnt(0)
23189; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23190; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23191; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
23192; GCN-NEXT:    v_max_f32_e32 v4, v4, v32
23193; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
23194; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
23195; GCN-NEXT:    s_waitcnt vmcnt(0)
23196; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23197; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23198; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
23199; GCN-NEXT:    v_max_f32_e32 v3, v3, v32
23200; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
23201; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
23202; GCN-NEXT:    s_waitcnt vmcnt(0)
23203; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23204; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23205; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
23206; GCN-NEXT:    v_max_f32_e32 v2, v2, v32
23207; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
23208; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
23209; GCN-NEXT:    s_waitcnt vmcnt(0)
23210; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23211; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23212; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
23213; GCN-NEXT:    v_max_f32_e32 v1, v1, v32
23214; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
23215; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
23216; GCN-NEXT:    s_waitcnt vmcnt(0)
23217; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
23218; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23219; GCN-NEXT:    v_max_f32_e32 v0, v0, v32
23220; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
23221; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
23222; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
23223; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
23224; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
23225; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
23226; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
23227; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
23228; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
23229; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
23230; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
23231; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
23232; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
23233; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
23234; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
23235; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
23236; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
23237; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
23238; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
23239; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
23240; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
23241; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
23242; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
23243; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
23244; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
23245; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
23246; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
23247; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
23248; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
23249; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23250; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23251; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
23252; GCN-NEXT:    s_setpc_b64 s[30:31]
23253;
23254; GFX7-LABEL: v_maxnum_v32bf16:
23255; GFX7:       ; %bb.0:
23256; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23257; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
23258; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
23259; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
23260; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23261; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
23262; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23263; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
23264; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
23265; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
23266; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
23267; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
23268; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
23269; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
23270; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
23271; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
23272; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
23273; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
23274; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
23275; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
23276; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
23277; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
23278; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
23279; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
23280; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
23281; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
23282; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
23283; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
23284; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
23285; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
23286; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
23287; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
23288; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
23289; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
23290; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
23291; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
23292; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
23293; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
23294; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
23295; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
23296; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
23297; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
23298; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
23299; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
23300; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
23301; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
23302; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
23303; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
23304; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
23305; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
23306; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
23307; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
23308; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
23309; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
23310; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
23311; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
23312; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
23313; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
23314; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
23315; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
23316; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
23317; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
23318; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
23319; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
23320; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
23321; GFX7-NEXT:    s_waitcnt vmcnt(1)
23322; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
23323; GFX7-NEXT:    s_waitcnt vmcnt(0)
23324; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23325; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23326; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
23327; GFX7-NEXT:    v_max_f32_e32 v31, v31, v32
23328; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
23329; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
23330; GFX7-NEXT:    s_waitcnt vmcnt(0)
23331; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23332; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23333; GFX7-NEXT:    v_max_f32_e32 v30, v30, v32
23334; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
23335; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23336; GFX7-NEXT:    s_waitcnt vmcnt(0)
23337; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23338; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23339; GFX7-NEXT:    v_max_f32_e32 v29, v29, v32
23340; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
23341; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23342; GFX7-NEXT:    s_waitcnt vmcnt(0)
23343; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23344; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23345; GFX7-NEXT:    v_max_f32_e32 v28, v28, v32
23346; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
23347; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
23348; GFX7-NEXT:    s_waitcnt vmcnt(0)
23349; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23350; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23351; GFX7-NEXT:    v_max_f32_e32 v27, v27, v32
23352; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
23353; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
23354; GFX7-NEXT:    s_waitcnt vmcnt(0)
23355; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23356; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23357; GFX7-NEXT:    v_max_f32_e32 v26, v26, v32
23358; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
23359; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
23360; GFX7-NEXT:    s_waitcnt vmcnt(0)
23361; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23362; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23363; GFX7-NEXT:    v_max_f32_e32 v25, v25, v32
23364; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
23365; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
23366; GFX7-NEXT:    s_waitcnt vmcnt(0)
23367; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23368; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23369; GFX7-NEXT:    v_max_f32_e32 v24, v24, v32
23370; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
23371; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
23372; GFX7-NEXT:    s_waitcnt vmcnt(0)
23373; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23374; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23375; GFX7-NEXT:    v_max_f32_e32 v23, v23, v32
23376; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
23377; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
23378; GFX7-NEXT:    s_waitcnt vmcnt(0)
23379; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23380; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23381; GFX7-NEXT:    v_max_f32_e32 v22, v22, v32
23382; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
23383; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
23384; GFX7-NEXT:    s_waitcnt vmcnt(0)
23385; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23386; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23387; GFX7-NEXT:    v_max_f32_e32 v21, v21, v32
23388; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
23389; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
23390; GFX7-NEXT:    s_waitcnt vmcnt(0)
23391; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23392; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23393; GFX7-NEXT:    v_max_f32_e32 v20, v20, v32
23394; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
23395; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
23396; GFX7-NEXT:    s_waitcnt vmcnt(0)
23397; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23398; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23399; GFX7-NEXT:    v_max_f32_e32 v19, v19, v32
23400; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
23401; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
23402; GFX7-NEXT:    s_waitcnt vmcnt(0)
23403; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23404; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23405; GFX7-NEXT:    v_max_f32_e32 v18, v18, v32
23406; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
23407; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
23408; GFX7-NEXT:    s_waitcnt vmcnt(0)
23409; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23410; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23411; GFX7-NEXT:    v_max_f32_e32 v17, v17, v32
23412; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
23413; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
23414; GFX7-NEXT:    s_waitcnt vmcnt(0)
23415; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23416; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23417; GFX7-NEXT:    v_max_f32_e32 v16, v16, v32
23418; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
23419; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
23420; GFX7-NEXT:    s_waitcnt vmcnt(0)
23421; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23422; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23423; GFX7-NEXT:    v_max_f32_e32 v15, v15, v32
23424; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
23425; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
23426; GFX7-NEXT:    s_waitcnt vmcnt(0)
23427; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23428; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23429; GFX7-NEXT:    v_max_f32_e32 v14, v14, v32
23430; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
23431; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
23432; GFX7-NEXT:    s_waitcnt vmcnt(0)
23433; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23434; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23435; GFX7-NEXT:    v_max_f32_e32 v13, v13, v32
23436; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
23437; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
23438; GFX7-NEXT:    s_waitcnt vmcnt(0)
23439; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23440; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23441; GFX7-NEXT:    v_max_f32_e32 v12, v12, v32
23442; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
23443; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
23444; GFX7-NEXT:    s_waitcnt vmcnt(0)
23445; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23446; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23447; GFX7-NEXT:    v_max_f32_e32 v11, v11, v32
23448; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
23449; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
23450; GFX7-NEXT:    s_waitcnt vmcnt(0)
23451; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23452; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23453; GFX7-NEXT:    v_max_f32_e32 v10, v10, v32
23454; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
23455; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
23456; GFX7-NEXT:    s_waitcnt vmcnt(0)
23457; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23458; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23459; GFX7-NEXT:    v_max_f32_e32 v9, v9, v32
23460; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
23461; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
23462; GFX7-NEXT:    s_waitcnt vmcnt(0)
23463; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23464; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23465; GFX7-NEXT:    v_max_f32_e32 v8, v8, v32
23466; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
23467; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
23468; GFX7-NEXT:    s_waitcnt vmcnt(0)
23469; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23470; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23471; GFX7-NEXT:    v_max_f32_e32 v7, v7, v32
23472; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
23473; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
23474; GFX7-NEXT:    s_waitcnt vmcnt(0)
23475; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23476; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23477; GFX7-NEXT:    v_max_f32_e32 v6, v6, v32
23478; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
23479; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
23480; GFX7-NEXT:    s_waitcnt vmcnt(0)
23481; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23482; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23483; GFX7-NEXT:    v_max_f32_e32 v5, v5, v32
23484; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
23485; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
23486; GFX7-NEXT:    s_waitcnt vmcnt(0)
23487; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23488; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23489; GFX7-NEXT:    v_max_f32_e32 v4, v4, v32
23490; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
23491; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
23492; GFX7-NEXT:    s_waitcnt vmcnt(0)
23493; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23494; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23495; GFX7-NEXT:    v_max_f32_e32 v3, v3, v32
23496; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
23497; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
23498; GFX7-NEXT:    s_waitcnt vmcnt(0)
23499; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23500; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23501; GFX7-NEXT:    v_max_f32_e32 v2, v2, v32
23502; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
23503; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
23504; GFX7-NEXT:    s_waitcnt vmcnt(0)
23505; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23506; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23507; GFX7-NEXT:    v_max_f32_e32 v1, v1, v32
23508; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
23509; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
23510; GFX7-NEXT:    s_waitcnt vmcnt(0)
23511; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
23512; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
23513; GFX7-NEXT:    v_max_f32_e32 v0, v0, v32
23514; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
23515; GFX7-NEXT:    s_setpc_b64 s[30:31]
23516;
23517; GFX8-LABEL: v_maxnum_v32bf16:
23518; GFX8:       ; %bb.0:
23519; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23520; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
23521; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
23522; GFX8-NEXT:    v_max_f32_e32 v31, v32, v31
23523; GFX8-NEXT:    v_bfe_u32 v32, v31, 16, 1
23524; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
23525; GFX8-NEXT:    v_add_u32_e32 v32, vcc, v32, v31
23526; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23527; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
23528; GFX8-NEXT:    v_add_u32_e32 v32, vcc, s4, v32
23529; GFX8-NEXT:    v_max_f32_e32 v14, v14, v30
23530; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v31
23531; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
23532; GFX8-NEXT:    v_bfe_u32 v30, v14, 16, 1
23533; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
23534; GFX8-NEXT:    v_add_u32_e32 v30, vcc, v30, v14
23535; GFX8-NEXT:    v_add_u32_e32 v30, vcc, s4, v30
23536; GFX8-NEXT:    v_or_b32_e32 v32, 0x400000, v14
23537; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
23538; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
23539; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
23540; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
23541; GFX8-NEXT:    v_max_f32_e32 v32, v32, v30
23542; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32
23543; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
23544; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
23545; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23546; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
23547; GFX8-NEXT:    v_max_f32_e32 v13, v13, v29
23548; GFX8-NEXT:    v_bfe_u32 v29, v13, 16, 1
23549; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
23550; GFX8-NEXT:    v_alignbit_b32 v14, v14, v31, 16
23551; GFX8-NEXT:    s_waitcnt vmcnt(0)
23552; GFX8-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
23553; GFX8-NEXT:    v_max_f32_e32 v33, v33, v34
23554; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23555; GFX8-NEXT:    v_max_f32_e32 v30, v15, v30
23556; GFX8-NEXT:    v_bfe_u32 v15, v33, 16, 1
23557; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v33
23558; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s4, v15
23559; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v33
23560; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
23561; GFX8-NEXT:    v_bfe_u32 v33, v30, 16, 1
23562; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
23563; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
23564; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23565; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v30
23566; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
23567; GFX8-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
23568; GFX8-NEXT:    v_bfe_u32 v33, v32, 16, 1
23569; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
23570; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23571; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v32
23572; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
23573; GFX8-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
23574; GFX8-NEXT:    v_add_u32_e32 v29, vcc, v29, v13
23575; GFX8-NEXT:    v_add_u32_e32 v29, vcc, s4, v29
23576; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v13
23577; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
23578; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v33, vcc
23579; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
23580; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v12
23581; GFX8-NEXT:    v_max_f32_e32 v29, v33, v29
23582; GFX8-NEXT:    v_bfe_u32 v33, v29, 16, 1
23583; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v29
23584; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
23585; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
23586; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23587; GFX8-NEXT:    v_max_f32_e32 v12, v12, v28
23588; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v29
23589; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
23590; GFX8-NEXT:    v_bfe_u32 v28, v12, 16, 1
23591; GFX8-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
23592; GFX8-NEXT:    v_add_u32_e32 v28, vcc, v28, v12
23593; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
23594; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v12
23595; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
23596; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
23597; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
23598; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
23599; GFX8-NEXT:    v_max_f32_e32 v28, v33, v28
23600; GFX8-NEXT:    v_bfe_u32 v33, v28, 16, 1
23601; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v28
23602; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
23603; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
23604; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23605; GFX8-NEXT:    v_max_f32_e32 v11, v11, v27
23606; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v28
23607; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
23608; GFX8-NEXT:    v_bfe_u32 v27, v11, 16, 1
23609; GFX8-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
23610; GFX8-NEXT:    v_add_u32_e32 v27, vcc, v27, v11
23611; GFX8-NEXT:    v_add_u32_e32 v27, vcc, s4, v27
23612; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v11
23613; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
23614; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
23615; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
23616; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
23617; GFX8-NEXT:    v_max_f32_e32 v27, v33, v27
23618; GFX8-NEXT:    v_bfe_u32 v33, v27, 16, 1
23619; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v27
23620; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
23621; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
23622; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23623; GFX8-NEXT:    v_max_f32_e32 v10, v10, v26
23624; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v27
23625; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
23626; GFX8-NEXT:    v_bfe_u32 v26, v10, 16, 1
23627; GFX8-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
23628; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v26, v10
23629; GFX8-NEXT:    v_add_u32_e32 v26, vcc, s4, v26
23630; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v10
23631; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
23632; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
23633; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
23634; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
23635; GFX8-NEXT:    v_max_f32_e32 v26, v33, v26
23636; GFX8-NEXT:    v_bfe_u32 v33, v26, 16, 1
23637; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v26
23638; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
23639; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
23640; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23641; GFX8-NEXT:    v_max_f32_e32 v9, v9, v25
23642; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v26
23643; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
23644; GFX8-NEXT:    v_bfe_u32 v25, v9, 16, 1
23645; GFX8-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
23646; GFX8-NEXT:    v_add_u32_e32 v25, vcc, v25, v9
23647; GFX8-NEXT:    v_add_u32_e32 v25, vcc, s4, v25
23648; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v9
23649; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
23650; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
23651; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
23652; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
23653; GFX8-NEXT:    v_max_f32_e32 v25, v33, v25
23654; GFX8-NEXT:    v_bfe_u32 v33, v25, 16, 1
23655; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v25
23656; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
23657; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
23658; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23659; GFX8-NEXT:    v_max_f32_e32 v8, v8, v24
23660; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v25
23661; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
23662; GFX8-NEXT:    v_bfe_u32 v24, v8, 16, 1
23663; GFX8-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
23664; GFX8-NEXT:    v_add_u32_e32 v24, vcc, v24, v8
23665; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v24
23666; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v8
23667; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
23668; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
23669; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
23670; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
23671; GFX8-NEXT:    v_max_f32_e32 v24, v33, v24
23672; GFX8-NEXT:    v_bfe_u32 v33, v24, 16, 1
23673; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v24
23674; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
23675; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
23676; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23677; GFX8-NEXT:    v_max_f32_e32 v7, v7, v23
23678; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v24
23679; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
23680; GFX8-NEXT:    v_bfe_u32 v23, v7, 16, 1
23681; GFX8-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
23682; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v7
23683; GFX8-NEXT:    v_add_u32_e32 v23, vcc, s4, v23
23684; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v7
23685; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
23686; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
23687; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
23688; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
23689; GFX8-NEXT:    v_max_f32_e32 v23, v33, v23
23690; GFX8-NEXT:    v_bfe_u32 v33, v23, 16, 1
23691; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v23
23692; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
23693; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
23694; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23695; GFX8-NEXT:    v_max_f32_e32 v6, v6, v22
23696; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v23
23697; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
23698; GFX8-NEXT:    v_bfe_u32 v22, v6, 16, 1
23699; GFX8-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
23700; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v6
23701; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s4, v22
23702; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v6
23703; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
23704; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
23705; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
23706; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
23707; GFX8-NEXT:    v_max_f32_e32 v22, v33, v22
23708; GFX8-NEXT:    v_bfe_u32 v33, v22, 16, 1
23709; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v22
23710; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
23711; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
23712; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23713; GFX8-NEXT:    v_max_f32_e32 v5, v5, v21
23714; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v22
23715; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
23716; GFX8-NEXT:    v_bfe_u32 v21, v5, 16, 1
23717; GFX8-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
23718; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v5
23719; GFX8-NEXT:    v_add_u32_e32 v21, vcc, s4, v21
23720; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v5
23721; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
23722; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
23723; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
23724; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
23725; GFX8-NEXT:    v_max_f32_e32 v21, v33, v21
23726; GFX8-NEXT:    v_bfe_u32 v33, v21, 16, 1
23727; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v21
23728; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
23729; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
23730; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23731; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
23732; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v21
23733; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
23734; GFX8-NEXT:    v_bfe_u32 v20, v4, 16, 1
23735; GFX8-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
23736; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v4
23737; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v20
23738; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v4
23739; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
23740; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
23741; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
23742; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
23743; GFX8-NEXT:    v_max_f32_e32 v20, v33, v20
23744; GFX8-NEXT:    v_bfe_u32 v33, v20, 16, 1
23745; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v20
23746; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
23747; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
23748; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23749; GFX8-NEXT:    v_max_f32_e32 v3, v3, v19
23750; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v20
23751; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
23752; GFX8-NEXT:    v_bfe_u32 v19, v3, 16, 1
23753; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
23754; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v3
23755; GFX8-NEXT:    v_add_u32_e32 v19, vcc, s4, v19
23756; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v3
23757; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
23758; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
23759; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
23760; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
23761; GFX8-NEXT:    v_max_f32_e32 v19, v33, v19
23762; GFX8-NEXT:    v_bfe_u32 v33, v19, 16, 1
23763; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v19
23764; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
23765; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
23766; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23767; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
23768; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v19
23769; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
23770; GFX8-NEXT:    v_bfe_u32 v18, v2, 16, 1
23771; GFX8-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
23772; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v2
23773; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s4, v18
23774; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v2
23775; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
23776; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
23777; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
23778; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
23779; GFX8-NEXT:    v_max_f32_e32 v18, v33, v18
23780; GFX8-NEXT:    v_bfe_u32 v33, v18, 16, 1
23781; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v18
23782; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
23783; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
23784; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23785; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
23786; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v18
23787; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
23788; GFX8-NEXT:    v_bfe_u32 v17, v1, 16, 1
23789; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
23790; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v1
23791; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s4, v17
23792; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v1
23793; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
23794; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
23795; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
23796; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
23797; GFX8-NEXT:    v_max_f32_e32 v17, v33, v17
23798; GFX8-NEXT:    v_bfe_u32 v33, v17, 16, 1
23799; GFX8-NEXT:    v_add_u32_e32 v33, vcc, v33, v17
23800; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
23801; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
23802; GFX8-NEXT:    v_add_u32_e32 v33, vcc, s4, v33
23803; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
23804; GFX8-NEXT:    v_or_b32_e32 v34, 0x400000, v17
23805; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
23806; GFX8-NEXT:    v_bfe_u32 v16, v0, 16, 1
23807; GFX8-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
23808; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v0
23809; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s4, v16
23810; GFX8-NEXT:    v_or_b32_e32 v33, 0x400000, v0
23811; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
23812; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
23813; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
23814; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
23815; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
23816; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
23817; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
23818; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
23819; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
23820; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
23821; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
23822; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
23823; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
23824; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
23825; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
23826; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
23827; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
23828; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
23829; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
23830; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
23831; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
23832; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
23833; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
23834; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
23835; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
23836; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
23837; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
23838; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
23839; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
23840; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
23841; GFX8-NEXT:    v_alignbit_b32 v13, v13, v32, 16
23842; GFX8-NEXT:    v_alignbit_b32 v15, v16, v15, 16
23843; GFX8-NEXT:    s_setpc_b64 s[30:31]
23844;
23845; GFX9-LABEL: v_maxnum_v32bf16:
23846; GFX9:       ; %bb.0:
23847; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23848; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
23849; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
23850; GFX9-NEXT:    v_max_f32_e32 v31, v32, v31
23851; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
23852; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
23853; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
23854; GFX9-NEXT:    v_bfe_u32 v32, v31, 16, 1
23855; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
23856; GFX9-NEXT:    v_add3_u32 v32, v32, v31, s4
23857; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v31
23858; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
23859; GFX9-NEXT:    v_bfe_u32 v30, v14, 16, 1
23860; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc
23861; GFX9-NEXT:    v_add3_u32 v30, v30, v14, s4
23862; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
23863; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
23864; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v32, vcc
23865; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
23866; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
23867; GFX9-NEXT:    v_max_f32_e32 v30, v32, v30
23868; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23869; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
23870; GFX9-NEXT:    v_bfe_u32 v32, v30, 16, 1
23871; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
23872; GFX9-NEXT:    v_add3_u32 v32, v32, v30, s4
23873; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v30
23874; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
23875; GFX9-NEXT:    v_bfe_u32 v29, v13, 16, 1
23876; GFX9-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc
23877; GFX9-NEXT:    v_add3_u32 v29, v29, v13, s4
23878; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v13
23879; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
23880; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v32, vcc
23881; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
23882; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
23883; GFX9-NEXT:    v_max_f32_e32 v32, v32, v29
23884; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32
23885; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
23886; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
23887; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
23888; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
23889; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
23890; GFX9-NEXT:    v_bfe_u32 v28, v12, 16, 1
23891; GFX9-NEXT:    v_add3_u32 v28, v28, v12, s4
23892; GFX9-NEXT:    s_waitcnt vmcnt(0)
23893; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
23894; GFX9-NEXT:    v_max_f32_e32 v33, v33, v34
23895; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
23896; GFX9-NEXT:    v_max_f32_e32 v29, v15, v29
23897; GFX9-NEXT:    v_bfe_u32 v15, v33, 16, 1
23898; GFX9-NEXT:    v_add3_u32 v15, v15, v33, s4
23899; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v33
23900; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
23901; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
23902; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v34, vcc
23903; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s4
23904; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
23905; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
23906; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
23907; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
23908; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s4
23909; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
23910; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
23911; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
23912; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v12
23913; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
23914; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc
23915; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
23916; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
23917; GFX9-NEXT:    v_max_f32_e32 v28, v33, v28
23918; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
23919; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
23920; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
23921; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
23922; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s4
23923; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
23924; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
23925; GFX9-NEXT:    v_bfe_u32 v27, v11, 16, 1
23926; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
23927; GFX9-NEXT:    v_add3_u32 v27, v27, v11, s4
23928; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v11
23929; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
23930; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v33, vcc
23931; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
23932; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
23933; GFX9-NEXT:    v_max_f32_e32 v27, v33, v27
23934; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
23935; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
23936; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
23937; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
23938; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s4
23939; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
23940; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
23941; GFX9-NEXT:    v_bfe_u32 v26, v10, 16, 1
23942; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
23943; GFX9-NEXT:    v_add3_u32 v26, v26, v10, s4
23944; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v10
23945; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
23946; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v33, vcc
23947; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
23948; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
23949; GFX9-NEXT:    v_max_f32_e32 v26, v33, v26
23950; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
23951; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
23952; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
23953; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
23954; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s4
23955; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
23956; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
23957; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
23958; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
23959; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s4
23960; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v9
23961; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
23962; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v33, vcc
23963; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
23964; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
23965; GFX9-NEXT:    v_max_f32_e32 v25, v33, v25
23966; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
23967; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
23968; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
23969; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
23970; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s4
23971; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
23972; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
23973; GFX9-NEXT:    v_bfe_u32 v24, v8, 16, 1
23974; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
23975; GFX9-NEXT:    v_add3_u32 v24, v24, v8, s4
23976; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v8
23977; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
23978; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v33, vcc
23979; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
23980; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
23981; GFX9-NEXT:    v_max_f32_e32 v24, v33, v24
23982; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
23983; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
23984; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
23985; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
23986; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s4
23987; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
23988; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
23989; GFX9-NEXT:    v_bfe_u32 v23, v7, 16, 1
23990; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
23991; GFX9-NEXT:    v_add3_u32 v23, v23, v7, s4
23992; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v7
23993; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
23994; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v33, vcc
23995; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
23996; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
23997; GFX9-NEXT:    v_max_f32_e32 v23, v33, v23
23998; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
23999; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
24000; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
24001; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
24002; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s4
24003; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
24004; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
24005; GFX9-NEXT:    v_bfe_u32 v22, v6, 16, 1
24006; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
24007; GFX9-NEXT:    v_add3_u32 v22, v22, v6, s4
24008; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v6
24009; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
24010; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v33, vcc
24011; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
24012; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
24013; GFX9-NEXT:    v_max_f32_e32 v22, v33, v22
24014; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
24015; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
24016; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
24017; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
24018; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s4
24019; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
24020; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
24021; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
24022; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
24023; GFX9-NEXT:    v_add3_u32 v21, v21, v5, s4
24024; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v5
24025; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
24026; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v33, vcc
24027; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
24028; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
24029; GFX9-NEXT:    v_max_f32_e32 v21, v33, v21
24030; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
24031; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
24032; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
24033; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
24034; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s4
24035; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
24036; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
24037; GFX9-NEXT:    v_bfe_u32 v20, v4, 16, 1
24038; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
24039; GFX9-NEXT:    v_add3_u32 v20, v20, v4, s4
24040; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v4
24041; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
24042; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v33, vcc
24043; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
24044; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
24045; GFX9-NEXT:    v_max_f32_e32 v20, v33, v20
24046; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
24047; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
24048; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
24049; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
24050; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s4
24051; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
24052; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
24053; GFX9-NEXT:    v_bfe_u32 v19, v3, 16, 1
24054; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
24055; GFX9-NEXT:    v_add3_u32 v19, v19, v3, s4
24056; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v3
24057; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
24058; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v33, vcc
24059; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
24060; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
24061; GFX9-NEXT:    v_max_f32_e32 v19, v33, v19
24062; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
24063; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
24064; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
24065; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
24066; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s4
24067; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
24068; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
24069; GFX9-NEXT:    v_bfe_u32 v18, v2, 16, 1
24070; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
24071; GFX9-NEXT:    v_add3_u32 v18, v18, v2, s4
24072; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v2
24073; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
24074; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v33, vcc
24075; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
24076; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
24077; GFX9-NEXT:    v_max_f32_e32 v18, v33, v18
24078; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
24079; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
24080; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
24081; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
24082; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s4
24083; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
24084; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
24085; GFX9-NEXT:    v_bfe_u32 v17, v1, 16, 1
24086; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
24087; GFX9-NEXT:    v_add3_u32 v17, v17, v1, s4
24088; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v1
24089; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
24090; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v33, vcc
24091; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
24092; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
24093; GFX9-NEXT:    v_max_f32_e32 v17, v33, v17
24094; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
24095; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24096; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
24097; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
24098; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s4
24099; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
24100; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
24101; GFX9-NEXT:    v_bfe_u32 v16, v0, 16, 1
24102; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
24103; GFX9-NEXT:    v_add3_u32 v16, v16, v0, s4
24104; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v0
24105; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
24106; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v33, vcc
24107; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
24108; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
24109; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
24110; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
24111; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
24112; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
24113; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
24114; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
24115; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
24116; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
24117; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
24118; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
24119; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
24120; GFX9-NEXT:    v_perm_b32 v12, v12, v32, s4
24121; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
24122; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
24123; GFX9-NEXT:    v_perm_b32 v15, v29, v15, s4
24124; GFX9-NEXT:    s_setpc_b64 s[30:31]
24125;
24126; GFX10-LABEL: v_maxnum_v32bf16:
24127; GFX10:       ; %bb.0:
24128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24129; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32
24130; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
24131; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
24132; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
24133; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
24134; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
24135; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
24136; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
24137; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
24138; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
24139; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
24140; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
24141; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
24142; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
24143; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
24144; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
24145; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
24146; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
24147; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
24148; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
24149; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
24150; GFX10-NEXT:    v_max_f32_e32 v39, v48, v39
24151; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v6
24152; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
24153; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
24154; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
24155; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v21
24156; GFX10-NEXT:    v_max_f32_e32 v49, v50, v49
24157; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
24158; GFX10-NEXT:    v_max_f32_e32 v33, v34, v33
24159; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
24160; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v24
24161; GFX10-NEXT:    v_max_f32_e32 v35, v36, v35
24162; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
24163; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
24164; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
24165; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
24166; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
24167; GFX10-NEXT:    v_max_f32_e32 v37, v38, v37
24168; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v7
24169; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
24170; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
24171; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
24172; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v16
24173; GFX10-NEXT:    v_max_f32_e32 v27, v50, v27
24174; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v0
24175; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
24176; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24177; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
24178; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
24179; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
24180; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
24181; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
24182; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
24183; GFX10-NEXT:    v_max_f32_e32 v8, v8, v24
24184; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v18
24185; GFX10-NEXT:    v_max_f32_e32 v29, v38, v29
24186; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v2
24187; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
24188; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
24189; GFX10-NEXT:    v_max_f32_e32 v7, v7, v23
24190; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
24191; GFX10-NEXT:    v_max_f32_e32 v28, v48, v28
24192; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
24193; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
24194; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
24195; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
24196; GFX10-NEXT:    v_bfe_u32 v16, v33, 16, 1
24197; GFX10-NEXT:    v_max_f32_e32 v10, v10, v26
24198; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v20
24199; GFX10-NEXT:    v_max_f32_e32 v34, v34, v51
24200; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
24201; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
24202; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
24203; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
24204; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v19
24205; GFX10-NEXT:    v_max_f32_e32 v30, v36, v30
24206; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v3
24207; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
24208; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
24209; GFX10-NEXT:    v_max_f32_e32 v2, v2, v18
24210; GFX10-NEXT:    v_max_f32_e32 v18, v48, v23
24211; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
24212; GFX10-NEXT:    v_max_f32_e32 v17, v50, v22
24213; GFX10-NEXT:    v_or_b32_e32 v22, 0x400000, v33
24214; GFX10-NEXT:    v_bfe_u32 v23, v14, 16, 1
24215; GFX10-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
24216; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
24217; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
24218; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
24219; GFX10-NEXT:    v_max_f32_e32 v4, v4, v20
24220; GFX10-NEXT:    v_max_f32_e32 v20, v36, v25
24221; GFX10-NEXT:    v_max_f32_e32 v3, v3, v19
24222; GFX10-NEXT:    v_max_f32_e32 v19, v38, v24
24223; GFX10-NEXT:    v_or_b32_e32 v24, 0x400000, v14
24224; GFX10-NEXT:    v_bfe_u32 v25, v35, 16, 1
24225; GFX10-NEXT:    v_add3_u32 v23, v23, v14, 0x7fff
24226; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v22, vcc_lo
24227; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
24228; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
24229; GFX10-NEXT:    v_max_f32_e32 v21, v51, v26
24230; GFX10-NEXT:    v_or_b32_e32 v26, 0x400000, v35
24231; GFX10-NEXT:    v_bfe_u32 v36, v13, 16, 1
24232; GFX10-NEXT:    v_add3_u32 v25, v25, v35, 0x7fff
24233; GFX10-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
24234; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
24235; GFX10-NEXT:    v_or_b32_e32 v38, 0x400000, v13
24236; GFX10-NEXT:    v_bfe_u32 v48, v37, 16, 1
24237; GFX10-NEXT:    v_add3_u32 v36, v36, v13, 0x7fff
24238; GFX10-NEXT:    v_or_b32_e32 v50, 0x400000, v37
24239; GFX10-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
24240; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
24241; GFX10-NEXT:    v_bfe_u32 v51, v12, 16, 1
24242; GFX10-NEXT:    v_add3_u32 v48, v48, v37, 0x7fff
24243; GFX10-NEXT:    v_or_b32_e32 v33, 0x400000, v12
24244; GFX10-NEXT:    v_bfe_u32 v22, v39, 16, 1
24245; GFX10-NEXT:    v_cndmask_b32_e32 v36, v36, v38, vcc_lo
24246; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
24247; GFX10-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
24248; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v39
24249; GFX10-NEXT:    v_bfe_u32 v24, v11, 16, 1
24250; GFX10-NEXT:    v_add3_u32 v22, v22, v39, 0x7fff
24251; GFX10-NEXT:    v_cndmask_b32_e32 v48, v48, v50, vcc_lo
24252; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
24253; GFX10-NEXT:    v_or_b32_e32 v35, 0x400000, v11
24254; GFX10-NEXT:    v_bfe_u32 v26, v49, 16, 1
24255; GFX10-NEXT:    v_add3_u32 v24, v24, v11, 0x7fff
24256; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v49
24257; GFX10-NEXT:    v_cndmask_b32_e32 v33, v51, v33, vcc_lo
24258; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
24259; GFX10-NEXT:    v_bfe_u32 v38, v10, 16, 1
24260; GFX10-NEXT:    v_add3_u32 v26, v26, v49, 0x7fff
24261; GFX10-NEXT:    v_or_b32_e32 v37, 0x400000, v10
24262; GFX10-NEXT:    v_bfe_u32 v50, v34, 16, 1
24263; GFX10-NEXT:    v_cndmask_b32_e32 v14, v22, v14, vcc_lo
24264; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
24265; GFX10-NEXT:    v_add3_u32 v38, v38, v10, 0x7fff
24266; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v34
24267; GFX10-NEXT:    v_bfe_u32 v51, v9, 16, 1
24268; GFX10-NEXT:    v_add3_u32 v50, v50, v34, 0x7fff
24269; GFX10-NEXT:    v_cndmask_b32_e32 v24, v24, v35, vcc_lo
24270; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
24271; GFX10-NEXT:    v_or_b32_e32 v39, 0x400000, v9
24272; GFX10-NEXT:    v_bfe_u32 v22, v30, 16, 1
24273; GFX10-NEXT:    v_add3_u32 v51, v51, v9, 0x7fff
24274; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v30
24275; GFX10-NEXT:    v_cndmask_b32_e32 v13, v26, v13, vcc_lo
24276; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
24277; GFX10-NEXT:    v_bfe_u32 v35, v8, 16, 1
24278; GFX10-NEXT:    v_add3_u32 v22, v22, v30, 0x7fff
24279; GFX10-NEXT:    v_or_b32_e32 v49, 0x400000, v8
24280; GFX10-NEXT:    v_bfe_u32 v26, v29, 16, 1
24281; GFX10-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc_lo
24282; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
24283; GFX10-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
24284; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v29
24285; GFX10-NEXT:    v_bfe_u32 v38, v7, 16, 1
24286; GFX10-NEXT:    v_add3_u32 v26, v26, v29, 0x7fff
24287; GFX10-NEXT:    v_cndmask_b32_e32 v12, v50, v12, vcc_lo
24288; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
24289; GFX10-NEXT:    v_or_b32_e32 v34, 0x400000, v7
24290; GFX10-NEXT:    v_bfe_u32 v50, v28, 16, 1
24291; GFX10-NEXT:    v_add3_u32 v38, v38, v7, 0x7fff
24292; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v28
24293; GFX10-NEXT:    v_cndmask_b32_e32 v39, v51, v39, vcc_lo
24294; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
24295; GFX10-NEXT:    v_bfe_u32 v51, v6, 16, 1
24296; GFX10-NEXT:    v_add3_u32 v50, v50, v28, 0x7fff
24297; GFX10-NEXT:    v_or_b32_e32 v30, 0x400000, v6
24298; GFX10-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
24299; GFX10-NEXT:    v_cndmask_b32_e32 v11, v22, v11, vcc_lo
24300; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
24301; GFX10-NEXT:    v_bfe_u32 v22, v27, 16, 1
24302; GFX10-NEXT:    v_add3_u32 v51, v51, v6, 0x7fff
24303; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v27
24304; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
24305; GFX10-NEXT:    v_cndmask_b32_e32 v35, v35, v49, vcc_lo
24306; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
24307; GFX10-NEXT:    v_bfe_u32 v49, v5, 16, 1
24308; GFX10-NEXT:    v_add3_u32 v22, v22, v27, 0x7fff
24309; GFX10-NEXT:    v_or_b32_e32 v29, 0x400000, v5
24310; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
24311; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
24312; GFX10-NEXT:    v_bfe_u32 v26, v21, 16, 1
24313; GFX10-NEXT:    v_add3_u32 v49, v49, v5, 0x7fff
24314; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v21
24315; GFX10-NEXT:    v_cndmask_b32_e32 v34, v38, v34, vcc_lo
24316; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
24317; GFX10-NEXT:    v_bfe_u32 v38, v4, 16, 1
24318; GFX10-NEXT:    v_add3_u32 v26, v26, v21, 0x7fff
24319; GFX10-NEXT:    v_or_b32_e32 v28, 0x400000, v4
24320; GFX10-NEXT:    v_cndmask_b32_e32 v9, v50, v9, vcc_lo
24321; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
24322; GFX10-NEXT:    v_bfe_u32 v50, v20, 16, 1
24323; GFX10-NEXT:    v_add3_u32 v38, v38, v4, 0x7fff
24324; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v20
24325; GFX10-NEXT:    v_cndmask_b32_e32 v30, v51, v30, vcc_lo
24326; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
24327; GFX10-NEXT:    v_add3_u32 v50, v50, v20, 0x7fff
24328; GFX10-NEXT:    v_bfe_u32 v51, v3, 16, 1
24329; GFX10-NEXT:    v_or_b32_e32 v27, 0x400000, v3
24330; GFX10-NEXT:    v_cndmask_b32_e32 v8, v22, v8, vcc_lo
24331; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
24332; GFX10-NEXT:    v_bfe_u32 v22, v19, 16, 1
24333; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v19
24334; GFX10-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
24335; GFX10-NEXT:    v_cndmask_b32_e32 v29, v49, v29, vcc_lo
24336; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
24337; GFX10-NEXT:    v_add3_u32 v22, v22, v19, 0x7fff
24338; GFX10-NEXT:    v_bfe_u32 v49, v2, 16, 1
24339; GFX10-NEXT:    v_or_b32_e32 v21, 0x400000, v2
24340; GFX10-NEXT:    v_cndmask_b32_e32 v7, v26, v7, vcc_lo
24341; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
24342; GFX10-NEXT:    v_bfe_u32 v26, v18, 16, 1
24343; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v18
24344; GFX10-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
24345; GFX10-NEXT:    v_cndmask_b32_e32 v28, v38, v28, vcc_lo
24346; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
24347; GFX10-NEXT:    v_bfe_u32 v38, v1, 16, 1
24348; GFX10-NEXT:    v_add3_u32 v26, v26, v18, 0x7fff
24349; GFX10-NEXT:    v_or_b32_e32 v20, 0x400000, v1
24350; GFX10-NEXT:    v_cndmask_b32_e32 v6, v50, v6, vcc_lo
24351; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
24352; GFX10-NEXT:    v_bfe_u32 v50, v17, 16, 1
24353; GFX10-NEXT:    v_add3_u32 v38, v38, v1, 0x7fff
24354; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v17
24355; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v5, vcc_lo
24356; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
24357; GFX10-NEXT:    v_bfe_u32 v22, v0, 16, 1
24358; GFX10-NEXT:    v_add3_u32 v50, v50, v17, 0x7fff
24359; GFX10-NEXT:    v_or_b32_e32 v18, 0x400000, v0
24360; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v4, vcc_lo
24361; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
24362; GFX10-NEXT:    v_add3_u32 v22, v22, v0, 0x7fff
24363; GFX10-NEXT:    v_cndmask_b32_e32 v1, v38, v20, vcc_lo
24364; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
24365; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
24366; GFX10-NEXT:    v_cndmask_b32_e32 v17, v50, v19, vcc_lo
24367; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
24368; GFX10-NEXT:    v_perm_b32 v4, v28, v7, 0x7060302
24369; GFX10-NEXT:    v_perm_b32 v7, v34, v10, 0x7060302
24370; GFX10-NEXT:    v_cndmask_b32_e32 v0, v22, v18, vcc_lo
24371; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
24372; GFX10-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
24373; GFX10-NEXT:    v_cndmask_b32_e32 v2, v49, v21, vcc_lo
24374; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
24375; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
24376; GFX10-NEXT:    v_cndmask_b32_e32 v3, v51, v27, vcc_lo
24377; GFX10-NEXT:    v_perm_b32 v5, v29, v8, 0x7060302
24378; GFX10-NEXT:    v_perm_b32 v8, v35, v11, 0x7060302
24379; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
24380; GFX10-NEXT:    v_perm_b32 v6, v30, v9, 0x7060302
24381; GFX10-NEXT:    v_perm_b32 v9, v39, v12, 0x7060302
24382; GFX10-NEXT:    s_waitcnt vmcnt(0)
24383; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
24384; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v32
24385; GFX10-NEXT:    v_max_f32_e32 v17, v31, v17
24386; GFX10-NEXT:    v_max_f32_e32 v15, v15, v18
24387; GFX10-NEXT:    v_bfe_u32 v10, v17, 16, 1
24388; GFX10-NEXT:    v_bfe_u32 v11, v15, 16, 1
24389; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v17
24390; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
24391; GFX10-NEXT:    v_or_b32_e32 v19, 0x400000, v15
24392; GFX10-NEXT:    v_add3_u32 v18, v10, v17, 0x7fff
24393; GFX10-NEXT:    v_add3_u32 v11, v11, v15, 0x7fff
24394; GFX10-NEXT:    v_perm_b32 v10, v37, v13, 0x7060302
24395; GFX10-NEXT:    v_perm_b32 v13, v36, v25, 0x7060302
24396; GFX10-NEXT:    v_cndmask_b32_e32 v17, v18, v12, vcc_lo
24397; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
24398; GFX10-NEXT:    v_perm_b32 v12, v33, v48, 0x7060302
24399; GFX10-NEXT:    v_cndmask_b32_e32 v15, v11, v19, vcc_lo
24400; GFX10-NEXT:    v_perm_b32 v11, v24, v14, 0x7060302
24401; GFX10-NEXT:    v_perm_b32 v14, v23, v16, 0x7060302
24402; GFX10-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
24403; GFX10-NEXT:    s_setpc_b64 s[30:31]
24404;
24405; GFX11-LABEL: v_maxnum_v32bf16:
24406; GFX11:       ; %bb.0:
24407; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24408; GFX11-NEXT:    scratch_load_b32 v32, off, s32
24409; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
24410; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
24411; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
24412; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
24413; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
24414; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
24415; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
24416; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
24417; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
24418; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
24419; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
24420; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24421; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
24422; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
24423; GFX11-NEXT:    v_bfe_u32 v103, v5, 16, 1
24424; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
24425; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
24426; GFX11-NEXT:    v_bfe_u32 v135, v1, 16, 1
24427; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v5
24428; GFX11-NEXT:    v_or_b32_e32 v144, 0x400000, v1
24429; GFX11-NEXT:    v_add3_u32 v103, v103, v5, 0x7fff
24430; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
24431; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
24432; GFX11-NEXT:    v_add3_u32 v135, v135, v1, 0x7fff
24433; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
24434; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
24435; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24436; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
24437; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
24438; GFX11-NEXT:    v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
24439; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
24440; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
24441; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
24442; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
24443; GFX11-NEXT:    v_bfe_u32 v129, v19, 16, 1
24444; GFX11-NEXT:    v_or_b32_e32 v130, 0x400000, v19
24445; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
24446; GFX11-NEXT:    v_bfe_u32 v119, v3, 16, 1
24447; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
24448; GFX11-NEXT:    v_add3_u32 v129, v129, v19, 0x7fff
24449; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
24450; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
24451; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24452; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
24453; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24454; GFX11-NEXT:    v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
24455; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
24456; GFX11-NEXT:    v_or_b32_e32 v128, 0x400000, v3
24457; GFX11-NEXT:    v_add3_u32 v119, v119, v3, 0x7fff
24458; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
24459; GFX11-NEXT:    v_bfe_u32 v145, v17, 16, 1
24460; GFX11-NEXT:    v_or_b32_e32 v146, 0x400000, v17
24461; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
24462; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
24463; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
24464; GFX11-NEXT:    v_add3_u32 v145, v145, v17, 0x7fff
24465; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
24466; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
24467; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
24468; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
24469; GFX11-NEXT:    v_max_f32_e32 v2, v2, v18
24470; GFX11-NEXT:    v_max_f32_e32 v0, v0, v16
24471; GFX11-NEXT:    v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
24472; GFX11-NEXT:    v_max_f32_e32 v7, v7, v23
24473; GFX11-NEXT:    v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
24474; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
24475; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
24476; GFX11-NEXT:    v_bfe_u32 v85, v24, 16, 1
24477; GFX11-NEXT:    v_bfe_u32 v97, v23, 16, 1
24478; GFX11-NEXT:    v_or_b32_e32 v86, 0x400000, v24
24479; GFX11-NEXT:    v_or_b32_e32 v98, 0x400000, v23
24480; GFX11-NEXT:    v_bfe_u32 v87, v7, 16, 1
24481; GFX11-NEXT:    v_add3_u32 v85, v85, v24, 0x7fff
24482; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
24483; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
24484; GFX11-NEXT:    v_add3_u32 v97, v97, v23, 0x7fff
24485; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
24486; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
24487; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v7
24488; GFX11-NEXT:    v_add3_u32 v87, v87, v7, 0x7fff
24489; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
24490; GFX11-NEXT:    v_max_f32_e32 v4, v4, v20
24491; GFX11-NEXT:    v_max_f32_e32 v20, v80, v71
24492; GFX11-NEXT:    v_bfe_u32 v71, v9, 16, 1
24493; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v9
24494; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
24495; GFX11-NEXT:    v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
24496; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24497; GFX11-NEXT:    v_add3_u32 v71, v71, v9, 0x7fff
24498; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
24499; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
24500; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
24501; GFX11-NEXT:    v_max_f32_e32 v26, v52, v51
24502; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
24503; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
24504; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
24505; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
24506; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
24507; GFX11-NEXT:    v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
24508; GFX11-NEXT:    v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
24509; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
24510; GFX11-NEXT:    v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
24511; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
24512; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24513; GFX11-NEXT:    v_max_f32_e32 v29, v38, v37
24514; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
24515; GFX11-NEXT:    v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
24516; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
24517; GFX11-NEXT:    v_max_f32_e32 v14, v14, v30
24518; GFX11-NEXT:    v_max_f32_e32 v28, v48, v39
24519; GFX11-NEXT:    v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
24520; GFX11-NEXT:    v_bfe_u32 v39, v13, 16, 1
24521; GFX11-NEXT:    v_bfe_u32 v35, v14, 16, 1
24522; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v14
24523; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
24524; GFX11-NEXT:    v_bfe_u32 v37, v30, 16, 1
24525; GFX11-NEXT:    v_bfe_u32 v16, v33, 16, 1
24526; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
24527; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
24528; GFX11-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
24529; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
24530; GFX11-NEXT:    v_add3_u32 v16, v16, v33, 0x7fff
24531; GFX11-NEXT:    v_add3_u32 v37, v37, v30, 0x7fff
24532; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v13
24533; GFX11-NEXT:    v_bfe_u32 v49, v29, 16, 1
24534; GFX11-NEXT:    v_add3_u32 v39, v39, v13, 0x7fff
24535; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v34, vcc_lo
24536; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
24537; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v29
24538; GFX11-NEXT:    v_bfe_u32 v51, v12, 16, 1
24539; GFX11-NEXT:    v_add3_u32 v49, v49, v29, 0x7fff
24540; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v12
24541; GFX11-NEXT:    v_cndmask_b32_e32 v14, v35, v36, vcc_lo
24542; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
24543; GFX11-NEXT:    v_bfe_u32 v53, v28, 16, 1
24544; GFX11-NEXT:    v_add3_u32 v51, v51, v12, 0x7fff
24545; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v28
24546; GFX11-NEXT:    v_bfe_u32 v55, v11, 16, 1
24547; GFX11-NEXT:    v_cndmask_b32_e32 v30, v37, v38, vcc_lo
24548; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
24549; GFX11-NEXT:    v_add3_u32 v53, v53, v28, 0x7fff
24550; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v11
24551; GFX11-NEXT:    v_bfe_u32 v65, v27, 16, 1
24552; GFX11-NEXT:    v_add3_u32 v55, v55, v11, 0x7fff
24553; GFX11-NEXT:    v_cndmask_b32_e32 v13, v39, v48, vcc_lo
24554; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
24555; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v27
24556; GFX11-NEXT:    v_bfe_u32 v67, v10, 16, 1
24557; GFX11-NEXT:    v_add3_u32 v65, v65, v27, 0x7fff
24558; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v10
24559; GFX11-NEXT:    v_cndmask_b32_e32 v29, v49, v50, vcc_lo
24560; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
24561; GFX11-NEXT:    v_bfe_u32 v69, v26, 16, 1
24562; GFX11-NEXT:    v_add3_u32 v67, v67, v10, 0x7fff
24563; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v26
24564; GFX11-NEXT:    v_bfe_u32 v81, v25, 16, 1
24565; GFX11-NEXT:    v_cndmask_b32_e32 v12, v51, v52, vcc_lo
24566; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
24567; GFX11-NEXT:    v_add3_u32 v69, v69, v26, 0x7fff
24568; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v25
24569; GFX11-NEXT:    v_bfe_u32 v83, v8, 16, 1
24570; GFX11-NEXT:    v_add3_u32 v81, v81, v25, 0x7fff
24571; GFX11-NEXT:    v_cndmask_b32_e32 v28, v53, v54, vcc_lo
24572; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
24573; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v8
24574; GFX11-NEXT:    v_add3_u32 v83, v83, v8, 0x7fff
24575; GFX11-NEXT:    v_bfe_u32 v99, v6, 16, 1
24576; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v6
24577; GFX11-NEXT:    v_cndmask_b32_e32 v11, v55, v64, vcc_lo
24578; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
24579; GFX11-NEXT:    v_bfe_u32 v101, v22, 16, 1
24580; GFX11-NEXT:    v_add3_u32 v99, v99, v6, 0x7fff
24581; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v22
24582; GFX11-NEXT:    v_bfe_u32 v113, v21, 16, 1
24583; GFX11-NEXT:    v_cndmask_b32_e32 v27, v65, v66, vcc_lo
24584; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
24585; GFX11-NEXT:    v_add3_u32 v101, v101, v22, 0x7fff
24586; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v21
24587; GFX11-NEXT:    v_bfe_u32 v115, v4, 16, 1
24588; GFX11-NEXT:    v_add3_u32 v113, v113, v21, 0x7fff
24589; GFX11-NEXT:    v_cndmask_b32_e32 v10, v67, v68, vcc_lo
24590; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
24591; GFX11-NEXT:    v_or_b32_e32 v116, 0x400000, v4
24592; GFX11-NEXT:    v_bfe_u32 v117, v20, 16, 1
24593; GFX11-NEXT:    v_add3_u32 v115, v115, v4, 0x7fff
24594; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v20
24595; GFX11-NEXT:    v_cndmask_b32_e32 v26, v69, v70, vcc_lo
24596; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
24597; GFX11-NEXT:    v_add3_u32 v117, v117, v20, 0x7fff
24598; GFX11-NEXT:    v_bfe_u32 v133, v18, 16, 1
24599; GFX11-NEXT:    v_or_b32_e32 v134, 0x400000, v18
24600; GFX11-NEXT:    v_bfe_u32 v147, v0, 16, 1
24601; GFX11-NEXT:    v_cndmask_b32_e32 v9, v71, v80, vcc_lo
24602; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
24603; GFX11-NEXT:    v_add3_u32 v133, v133, v18, 0x7fff
24604; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v0
24605; GFX11-NEXT:    v_add3_u32 v147, v147, v0, 0x7fff
24606; GFX11-NEXT:    v_bfe_u32 v131, v2, 16, 1
24607; GFX11-NEXT:    v_cndmask_b32_e32 v25, v81, v82, vcc_lo
24608; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
24609; GFX11-NEXT:    v_or_b32_e32 v132, 0x400000, v2
24610; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
24611; GFX11-NEXT:    v_add3_u32 v131, v131, v2, 0x7fff
24612; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
24613; GFX11-NEXT:    v_cndmask_b32_e32 v8, v83, v84, vcc_lo
24614; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
24615; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
24616; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
24617; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
24618; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
24619; GFX11-NEXT:    v_cndmask_b32_e32 v24, v85, v86, vcc_lo
24620; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
24621; GFX11-NEXT:    v_perm_b32 v14, v14, v16, 0x7060302
24622; GFX11-NEXT:    v_cndmask_b32_e32 v7, v87, v96, vcc_lo
24623; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
24624; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24625; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
24626; GFX11-NEXT:    v_cndmask_b32_e32 v23, v97, v98, vcc_lo
24627; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
24628; GFX11-NEXT:    v_cndmask_b32_e32 v6, v99, v100, vcc_lo
24629; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
24630; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
24631; GFX11-NEXT:    v_cndmask_b32_e32 v22, v101, v102, vcc_lo
24632; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
24633; GFX11-NEXT:    v_cndmask_b32_e32 v5, v103, v112, vcc_lo
24634; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
24635; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24636; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
24637; GFX11-NEXT:    v_cndmask_b32_e32 v21, v113, v114, vcc_lo
24638; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
24639; GFX11-NEXT:    v_cndmask_b32_e32 v4, v115, v116, vcc_lo
24640; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
24641; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
24642; GFX11-NEXT:    v_cndmask_b32_e32 v20, v117, v118, vcc_lo
24643; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
24644; GFX11-NEXT:    v_cndmask_b32_e32 v19, v129, v130, vcc_lo
24645; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
24646; GFX11-NEXT:    v_cndmask_b32_e32 v18, v133, v134, vcc_lo
24647; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
24648; GFX11-NEXT:    v_cndmask_b32_e32 v1, v135, v144, vcc_lo
24649; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
24650; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24651; GFX11-NEXT:    v_perm_b32 v1, v1, v18, 0x7060302
24652; GFX11-NEXT:    v_cndmask_b32_e32 v17, v145, v146, vcc_lo
24653; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
24654; GFX11-NEXT:    v_cndmask_b32_e32 v0, v147, v33, vcc_lo
24655; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
24656; GFX11-NEXT:    v_perm_b32 v0, v0, v17, 0x7060302
24657; GFX11-NEXT:    v_cndmask_b32_e32 v2, v131, v132, vcc_lo
24658; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
24659; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
24660; GFX11-NEXT:    v_perm_b32 v2, v2, v19, 0x7060302
24661; GFX11-NEXT:    v_cndmask_b32_e32 v3, v119, v128, vcc_lo
24662; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x7060302
24663; GFX11-NEXT:    s_waitcnt vmcnt(0)
24664; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
24665; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24666; GFX11-NEXT:    v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
24667; GFX11-NEXT:    v_max_f32_e32 v15, v15, v18
24668; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
24669; GFX11-NEXT:    v_bfe_u32 v18, v17, 16, 1
24670; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
24671; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
24672; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
24673; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
24674; GFX11-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
24675; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
24676; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24677; GFX11-NEXT:    v_cndmask_b32_e32 v17, v18, v20, vcc_lo
24678; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
24679; GFX11-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
24680; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
24681; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
24682; GFX11-NEXT:    s_setpc_b64 s[30:31]
24683  %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
24684  ret <32 x bfloat> %op
24685}
24686
; Declaration of the square-root intrinsic for a scalar bfloat (bfloat in,
; bfloat out); it is called by @v_sqrt_bf16 below.
24687declare bfloat @llvm.sqrt.bf16(bfloat)
24688
; Codegen test for llvm.sqrt on a scalar bfloat: the function passes %a to
; @llvm.sqrt.bf16 and returns the result unchanged.
;
; The assertion lines inside the body are generated by
; utils/update_llc_test_checks.py (see the first line of this file), with one
; group per check prefix named on the RUN lines. Regenerate them with that
; script instead of editing them by hand.
;
; NOTE(review): in every group the expected code first widens the argument to
; 32 bits (shift left by 16, or mask with 0xffff0000) and then uses
; v_sqrt_f32, so the bf16 square root appears to be computed in f32 - confirm
; against the AMDGPU lowering before relying on this.
24689define bfloat @v_sqrt_bf16(bfloat %a) {
24690; GCN-LABEL: v_sqrt_bf16:
24691; GCN:       ; %bb.0:
24692; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24693; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
24694; GCN-NEXT:    s_mov_b32 s4, 0xf800000
24695; GCN-NEXT:    v_mov_b32_e32 v1, 0x260
24696; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24697; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
24698; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
24699; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
24700; GCN-NEXT:    v_sqrt_f32_e32 v2, v0
24701; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
24702; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
24703; GCN-NEXT:    v_fma_f32 v5, -v3, v2, v0
24704; GCN-NEXT:    v_fma_f32 v6, -v4, v2, v0
24705; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
24706; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
24707; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
24708; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
24709; GCN-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
24710; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
24711; GCN-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
24712; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
24713; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24714; GCN-NEXT:    s_setpc_b64 s[30:31]
24715;
24716; GFX7-LABEL: v_sqrt_bf16:
24717; GFX7:       ; %bb.0:
24718; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24719; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
24720; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24721; GFX7-NEXT:    s_mov_b32 s4, 0xf800000
24722; GFX7-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
24723; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
24724; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
24725; GFX7-NEXT:    v_sqrt_f32_e32 v1, v0
24726; GFX7-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
24727; GFX7-NEXT:    v_fma_f32 v3, -v2, v1, v0
24728; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
24729; GFX7-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24730; GFX7-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
24731; GFX7-NEXT:    v_fma_f32 v1, -v3, v1, v0
24732; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
24733; GFX7-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24734; GFX7-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
24735; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
24736; GFX7-NEXT:    v_mov_b32_e32 v2, 0x260
24737; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
24738; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
24739; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24740; GFX7-NEXT:    s_setpc_b64 s[30:31]
24741;
24742; GFX8-LABEL: v_sqrt_bf16:
24743; GFX8:       ; %bb.0:
24744; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24745; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24746; GFX8-NEXT:    s_mov_b32 s4, 0xf800000
24747; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
24748; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
24749; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
24750; GFX8-NEXT:    v_sqrt_f32_e32 v1, v0
24751; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], -1, v1
24752; GFX8-NEXT:    v_fma_f32 v3, -v2, v1, v0
24753; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
24754; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24755; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], 1, v1
24756; GFX8-NEXT:    v_fma_f32 v1, -v3, v1, v0
24757; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
24758; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24759; GFX8-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
24760; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
24761; GFX8-NEXT:    v_mov_b32_e32 v2, 0x260
24762; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
24763; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
24764; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
24765; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
24766; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
24767; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24768; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
24769; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
24770; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24771; GFX8-NEXT:    s_setpc_b64 s[30:31]
24772;
24773; GFX9-LABEL: v_sqrt_bf16:
24774; GFX9:       ; %bb.0:
24775; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24776; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24777; GFX9-NEXT:    s_mov_b32 s4, 0xf800000
24778; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
24779; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
24780; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
24781; GFX9-NEXT:    v_sqrt_f32_e32 v1, v0
24782; GFX9-NEXT:    v_add_u32_e32 v2, -1, v1
24783; GFX9-NEXT:    v_fma_f32 v3, -v2, v1, v0
24784; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
24785; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
24786; GFX9-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24787; GFX9-NEXT:    v_fma_f32 v1, -v3, v1, v0
24788; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
24789; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24790; GFX9-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
24791; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
24792; GFX9-NEXT:    v_mov_b32_e32 v2, 0x260
24793; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
24794; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
24795; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
24796; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
24797; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
24798; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24799; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
24800; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
24801; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24802; GFX9-NEXT:    s_setpc_b64 s[30:31]
24803;
24804; GFX10-LABEL: v_sqrt_bf16:
24805; GFX10:       ; %bb.0:
24806; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24807; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24808; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
24809; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
24810; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
24811; GFX10-NEXT:    v_sqrt_f32_e32 v1, v0
24812; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v1
24813; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v1
24814; GFX10-NEXT:    v_fma_f32 v4, -v2, v1, v0
24815; GFX10-NEXT:    v_fma_f32 v5, -v3, v1, v0
24816; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, 0, v4
24817; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s4
24818; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0, v5
24819; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
24820; GFX10-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
24821; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
24822; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
24823; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
24824; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
24825; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24826; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
24827; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
24828; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24829; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24830; GFX10-NEXT:    s_setpc_b64 s[30:31]
24831;
24832; GFX11-LABEL: v_sqrt_bf16:
24833; GFX11:       ; %bb.0:
24834; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24835; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24836; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
24837; GFX11-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
24838; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
24839; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
24840; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
24841; GFX11-NEXT:    v_sqrt_f32_e32 v1, v0
24842; GFX11-NEXT:    s_waitcnt_depctr 0xfff
24843; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v1
24844; GFX11-NEXT:    v_add_nc_u32_e32 v3, 1, v1
24845; GFX11-NEXT:    v_fma_f32 v4, -v2, v1, v0
24846; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
24847; GFX11-NEXT:    v_fma_f32 v5, -v3, v1, v0
24848; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
24849; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
24850; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
24851; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
24852; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24853; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
24854; GFX11-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
24855; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
24856; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
24857; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
24858; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
24859; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
24860; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
24861; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24862; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
24863; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
24864; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24865; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24866; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24867; GFX11-NEXT:    s_setpc_b64 s[30:31]
24868  %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
24869  ret bfloat %op
24870}
24871
24872declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
24873
; Codegen test for the llvm.ldexp.bf16.i32 intrinsic applied to one bfloat
; value %a with an i32 exponent %b; the intrinsic result is returned directly.
; In the expected output the two oldest targets mask with 0xffff0000 before and
; after v_ldexp_f32, while the newer targets shift the input left by 16, run
; v_ldexp_f32, and then narrow with the bfe / add 0x7fff / or 0x400000 /
; unordered-compare select sequence followed by a right shift by 16.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
24874define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
24875; GCN-LABEL: v_ldexp_bf16_i32:
24876; GCN:       ; %bb.0:
24877; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24878; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
24879; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24880; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
24881; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24882; GCN-NEXT:    s_setpc_b64 s[30:31]
24883;
24884; GFX7-LABEL: v_ldexp_bf16_i32:
24885; GFX7:       ; %bb.0:
24886; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24887; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
24888; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24889; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
24890; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24891; GFX7-NEXT:    s_setpc_b64 s[30:31]
24892;
24893; GFX8-LABEL: v_ldexp_bf16_i32:
24894; GFX8:       ; %bb.0:
24895; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24896; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24897; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
24898; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
24899; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
24900; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
24901; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24902; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
24903; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
24904; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24905; GFX8-NEXT:    s_setpc_b64 s[30:31]
24906;
24907; GFX9-LABEL: v_ldexp_bf16_i32:
24908; GFX9:       ; %bb.0:
24909; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24910; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24911; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
24912; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
24913; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
24914; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
24915; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24916; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
24917; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
24918; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24919; GFX9-NEXT:    s_setpc_b64 s[30:31]
24920;
24921; GFX10-LABEL: v_ldexp_bf16_i32:
24922; GFX10:       ; %bb.0:
24923; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24924; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24925; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
24926; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
24927; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24928; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
24929; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
24930; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24931; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24932; GFX10-NEXT:    s_setpc_b64 s[30:31]
24933;
24934; GFX11-LABEL: v_ldexp_bf16_i32:
24935; GFX11:       ; %bb.0:
24936; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24937; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
24938; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24939; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
24940; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
24941; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
24942; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
24943; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
24944; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
24945; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24946; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
24947; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24948; GFX11-NEXT:    s_setpc_b64 s[30:31]
24949  %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
24950  ret bfloat %op
24951}
24952
24953declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
24954
; Codegen test for the llvm.frexp.bf16.i16 intrinsic: takes one bfloat %a and
; returns the { bfloat, i16 } aggregate produced by the intrinsic unchanged.
; In the expected output only the baseline target (no -mcpu) compares |v0|
; against 0x7f800000 (the f32 +infinity bit pattern) and selects between the
; raw input / 0 and the v_frexp_mant / v_frexp_exp results; every other target
; uses the two frexp instructions without that select.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
24955define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
24956; GCN-LABEL: v_frexp_bf16_i16:
24957; GCN:       ; %bb.0:
24958; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24959; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
24960; GCN-NEXT:    s_mov_b32 s4, 0x7f800000
24961; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24962; GCN-NEXT:    v_frexp_mant_f32_e32 v1, v0
24963; GCN-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
24964; GCN-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
24965; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
24966; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
24967; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24968; GCN-NEXT:    s_setpc_b64 s[30:31]
24969;
24970; GFX7-LABEL: v_frexp_bf16_i16:
24971; GFX7:       ; %bb.0:
24972; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24973; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
24974; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24975; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
24976; GFX7-NEXT:    v_frexp_mant_f32_e32 v0, v0
24977; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
24978; GFX7-NEXT:    s_setpc_b64 s[30:31]
24979;
24980; GFX8-LABEL: v_frexp_bf16_i16:
24981; GFX8:       ; %bb.0:
24982; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24983; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
24984; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v1
24985; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
24986; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
24987; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
24988; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
24989; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
24990; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
24991; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
24992; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
24993; GFX8-NEXT:    s_setpc_b64 s[30:31]
24994;
24995; GFX9-LABEL: v_frexp_bf16_i16:
24996; GFX9:       ; %bb.0:
24997; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24998; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
24999; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v1
25000; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
25001; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25002; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
25003; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
25004; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25005; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
25006; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25007; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
25008; GFX9-NEXT:    s_setpc_b64 s[30:31]
25009;
25010; GFX10-LABEL: v_frexp_bf16_i16:
25011; GFX10:       ; %bb.0:
25012; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25013; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
25014; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, v1
25015; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
25016; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
25017; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
25018; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25019; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
25020; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
25021; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25022; GFX10-NEXT:    s_setpc_b64 s[30:31]
25023;
25024; GFX11-LABEL: v_frexp_bf16_i16:
25025; GFX11:       ; %bb.0:
25026; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25027; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
25028; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25029; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v1
25030; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
25031; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
25032; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25033; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25034; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
25035; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
25036; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
25037; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
25038; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25039; GFX11-NEXT:    s_setpc_b64 s[30:31]
25040  %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
25041  ret { bfloat, i16 } %op
25042}
25043
25044
25045declare bfloat @llvm.log.bf16(bfloat)
25046declare bfloat @llvm.log2.bf16(bfloat)
25047declare bfloat @llvm.log10.bf16(bfloat)
25048
; Codegen test for the llvm.log.bf16 intrinsic on one bfloat value %a; the
; intrinsic result is returned directly.
; Every expected sequence compares the widened input against 0x800000, uses
; v_ldexp_f32 with an exponent of 0 or 32 (the 0/1 select shifted left by 5)
; before v_log_f32, and afterwards subtracts 0x41b17218 (about 22.18) when the
; input was scaled. The scaling by the natural-log-of-2 factor appears in two
; shapes: a split-constant multiply/add chain (0x3f317000 and 0x3805fdf4) on
; the baseline and tonga targets, and an fma chain (0x3f317217 and 0x3377d1cf)
; on the others.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
25049define bfloat @v_log_bf16(bfloat %a) {
25050; GCN-LABEL: v_log_bf16:
25051; GCN:       ; %bb.0:
25052; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25053; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25054; GCN-NEXT:    s_mov_b32 s4, 0x800000
25055; GCN-NEXT:    s_mov_b32 s5, 0x7f800000
25056; GCN-NEXT:    v_mov_b32_e32 v1, 0x41b17218
25057; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25058; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25059; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
25060; GCN-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
25061; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v2
25062; GCN-NEXT:    v_log_f32_e32 v0, v0
25063; GCN-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
25064; GCN-NEXT:    v_sub_f32_e32 v3, v0, v2
25065; GCN-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
25066; GCN-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
25067; GCN-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v3
25068; GCN-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v3
25069; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
25070; GCN-NEXT:    v_add_f32_e32 v3, v5, v3
25071; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
25072; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s5
25073; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
25074; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25075; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
25076; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25077; GCN-NEXT:    s_setpc_b64 s[30:31]
25078;
25079; GFX7-LABEL: v_log_bf16:
25080; GFX7:       ; %bb.0:
25081; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25082; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25083; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25084; GFX7-NEXT:    s_mov_b32 s4, 0x800000
25085; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25086; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25087; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25088; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
25089; GFX7-NEXT:    v_log_f32_e32 v0, v0
25090; GFX7-NEXT:    s_mov_b32 s4, 0x3f317217
25091; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
25092; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
25093; GFX7-NEXT:    s_mov_b32 s4, 0x3377d1cf
25094; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
25095; GFX7-NEXT:    s_mov_b32 s4, 0x7f800000
25096; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
25097; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25098; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25099; GFX7-NEXT:    v_mov_b32_e32 v1, 0x41b17218
25100; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25101; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
25102; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25103; GFX7-NEXT:    s_setpc_b64 s[30:31]
25104;
25105; GFX8-LABEL: v_log_bf16:
25106; GFX8:       ; %bb.0:
25107; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25108; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25109; GFX8-NEXT:    s_mov_b32 s4, 0x800000
25110; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25111; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25112; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25113; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
25114; GFX8-NEXT:    v_log_f32_e32 v0, v0
25115; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
25116; GFX8-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
25117; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
25118; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v2
25119; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v2
25120; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
25121; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
25122; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
25123; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
25124; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
25125; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25126; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25127; GFX8-NEXT:    v_mov_b32_e32 v1, 0x41b17218
25128; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25129; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
25130; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
25131; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
25132; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
25133; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25134; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25135; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25136; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25137; GFX8-NEXT:    s_setpc_b64 s[30:31]
25138;
25139; GFX9-LABEL: v_log_bf16:
25140; GFX9:       ; %bb.0:
25141; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25142; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25143; GFX9-NEXT:    s_mov_b32 s4, 0x800000
25144; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25145; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25146; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25147; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
25148; GFX9-NEXT:    v_log_f32_e32 v0, v0
25149; GFX9-NEXT:    s_mov_b32 s4, 0x3f317217
25150; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
25151; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
25152; GFX9-NEXT:    s_mov_b32 s4, 0x3377d1cf
25153; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
25154; GFX9-NEXT:    s_mov_b32 s4, 0x7f800000
25155; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
25156; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25157; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25158; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41b17218
25159; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25160; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
25161; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
25162; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25163; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
25164; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25165; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25166; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25167; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25168; GFX9-NEXT:    s_setpc_b64 s[30:31]
25169;
25170; GFX10-LABEL: v_log_bf16:
25171; GFX10:       ; %bb.0:
25172; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25173; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25174; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25175; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25176; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25177; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
25178; GFX10-NEXT:    v_log_f32_e32 v0, v0
25179; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
25180; GFX10-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
25181; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
25182; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
25183; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
25184; GFX10-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25185; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25186; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
25187; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
25188; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25189; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25190; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25191; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25192; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25193; GFX10-NEXT:    s_setpc_b64 s[30:31]
25194;
25195; GFX11-LABEL: v_log_bf16:
25196; GFX11:       ; %bb.0:
25197; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25198; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25199; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
25200; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25201; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25202; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25203; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25204; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
25205; GFX11-NEXT:    v_log_f32_e32 v0, v0
25206; GFX11-NEXT:    s_waitcnt_depctr 0xfff
25207; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
25208; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25209; GFX11-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
25210; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
25211; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25212; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
25213; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
25214; GFX11-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25215; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25216; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25217; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
25218; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
25219; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25220; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25221; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25222; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25223; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25224; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
25225; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25226; GFX11-NEXT:    s_setpc_b64 s[30:31]
25227  %op = call bfloat @llvm.log.bf16(bfloat %a)
25228  ret bfloat %op
25229}
25230
; Codegen test for the llvm.log2.bf16 intrinsic on one bfloat value %a; the
; intrinsic result is returned directly.
; Every expected sequence compares the widened input against 0x800000, uses
; v_ldexp_f32 with an exponent of 0 or 32 (the 0/1 select shifted left by 5)
; before v_log_f32, and then subtracts 0x42000000 (32.0 as f32) when the input
; was scaled. No extra constant multiply follows v_log_f32 in any variant.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
25231define bfloat @v_log2_bf16(bfloat %a) {
25232; GCN-LABEL: v_log2_bf16:
25233; GCN:       ; %bb.0:
25234; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25235; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25236; GCN-NEXT:    s_mov_b32 s4, 0x800000
25237; GCN-NEXT:    v_mov_b32_e32 v1, 0x42000000
25238; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25239; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25240; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
25241; GCN-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
25242; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v2
25243; GCN-NEXT:    v_log_f32_e32 v0, v0
25244; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25245; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
25246; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25247; GCN-NEXT:    s_setpc_b64 s[30:31]
25248;
25249; GFX7-LABEL: v_log2_bf16:
25250; GFX7:       ; %bb.0:
25251; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25252; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25253; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25254; GFX7-NEXT:    s_mov_b32 s4, 0x800000
25255; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25256; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25257; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25258; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
25259; GFX7-NEXT:    v_log_f32_e32 v0, v0
25260; GFX7-NEXT:    v_mov_b32_e32 v1, 0x42000000
25261; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25262; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
25263; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25264; GFX7-NEXT:    s_setpc_b64 s[30:31]
25265;
25266; GFX8-LABEL: v_log2_bf16:
25267; GFX8:       ; %bb.0:
25268; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25269; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25270; GFX8-NEXT:    s_mov_b32 s4, 0x800000
25271; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25272; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25273; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25274; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
25275; GFX8-NEXT:    v_log_f32_e32 v0, v0
25276; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
25277; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25278; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
25279; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
25280; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
25281; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
25282; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25283; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25284; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25285; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25286; GFX8-NEXT:    s_setpc_b64 s[30:31]
25287;
25288; GFX9-LABEL: v_log2_bf16:
25289; GFX9:       ; %bb.0:
25290; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25291; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25292; GFX9-NEXT:    s_mov_b32 s4, 0x800000
25293; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25294; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
25295; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
25296; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
25297; GFX9-NEXT:    v_log_f32_e32 v0, v0
25298; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
25299; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25300; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25301; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
25302; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
25303; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
25304; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25305; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25306; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25307; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25308; GFX9-NEXT:    s_setpc_b64 s[30:31]
25309;
25310; GFX10-LABEL: v_log2_bf16:
25311; GFX10:       ; %bb.0:
25312; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25313; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25314; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25315; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
25316; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
25317; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
25318; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
25319; GFX10-NEXT:    v_log_f32_e32 v0, v0
25320; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
25321; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
25322; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25323; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25324; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25325; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25326; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25327; GFX10-NEXT:    s_setpc_b64 s[30:31]
25328;
25329; GFX11-LABEL: v_log2_bf16:
25330; GFX11:       ; %bb.0:
25331; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25332; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25333; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
25334; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25335; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
25336; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
25337; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
25338; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25339; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
25340; GFX11-NEXT:    v_log_f32_e32 v0, v0
25341; GFX11-NEXT:    s_waitcnt_depctr 0xfff
25342; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
25343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25344; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
25345; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25346; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25347; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25348; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25349; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25350; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25351; GFX11-NEXT:    s_setpc_b64 s[30:31]
25352  %op = call bfloat @llvm.log2.bf16(bfloat %a)
25353  ret bfloat %op
25354}
25355
; Codegen test for the llvm.log10.bf16 intrinsic on one bfloat value %a; the
; intrinsic result is returned directly.
; Every expected sequence compares the widened input against 0x800000, uses
; v_ldexp_f32 with an exponent of 0 or 32 (the 0/1 select shifted left by 5)
; before v_log_f32, and afterwards subtracts 0x411a209b (about 9.633) when the
; input was scaled. The scaling by the log10-of-2 factor appears in two shapes:
; a split-constant multiply/add chain (0x3e9a2000 and 0x369a84fb) on the
; baseline and tonga targets, and an fma chain (0x3e9a209a and 0x3284fbcf) on
; the others.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
25356define bfloat @v_log10_bf16(bfloat %a) {
25357; GCN-LABEL: v_log10_bf16:
25358; GCN:       ; %bb.0:
25359; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25360; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25361; GCN-NEXT:    s_mov_b32 s4, 0x800000
25362; GCN-NEXT:    s_mov_b32 s5, 0x7f800000
25363; GCN-NEXT:    v_mov_b32_e32 v1, 0x411a209b
25364; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25365; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25366; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
25367; GCN-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
25368; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v2
25369; GCN-NEXT:    v_log_f32_e32 v0, v0
25370; GCN-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
25371; GCN-NEXT:    v_sub_f32_e32 v3, v0, v2
25372; GCN-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
25373; GCN-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
25374; GCN-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v3
25375; GCN-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v3
25376; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
25377; GCN-NEXT:    v_add_f32_e32 v3, v5, v3
25378; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
25379; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s5
25380; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
25381; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25382; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
25383; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25384; GCN-NEXT:    s_setpc_b64 s[30:31]
25385;
25386; GFX7-LABEL: v_log10_bf16:
25387; GFX7:       ; %bb.0:
25388; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25389; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25390; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25391; GFX7-NEXT:    s_mov_b32 s4, 0x800000
25392; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25393; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25394; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25395; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
25396; GFX7-NEXT:    v_log_f32_e32 v0, v0
25397; GFX7-NEXT:    s_mov_b32 s4, 0x3e9a209a
25398; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
25399; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
25400; GFX7-NEXT:    s_mov_b32 s4, 0x3284fbcf
25401; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
25402; GFX7-NEXT:    s_mov_b32 s4, 0x7f800000
25403; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
25404; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25405; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25406; GFX7-NEXT:    v_mov_b32_e32 v1, 0x411a209b
25407; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25408; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
25409; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25410; GFX7-NEXT:    s_setpc_b64 s[30:31]
25411;
25412; GFX8-LABEL: v_log10_bf16:
25413; GFX8:       ; %bb.0:
25414; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25415; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25416; GFX8-NEXT:    s_mov_b32 s4, 0x800000
25417; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25418; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25419; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25420; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
25421; GFX8-NEXT:    v_log_f32_e32 v0, v0
25422; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
25423; GFX8-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
25424; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
25425; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v2
25426; GFX8-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v2
25427; GFX8-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
25428; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
25429; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
25430; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
25431; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
25432; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25433; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25434; GFX8-NEXT:    v_mov_b32_e32 v1, 0x411a209b
25435; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25436; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
25437; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
25438; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
25439; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
25440; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25441; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25442; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25443; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25444; GFX8-NEXT:    s_setpc_b64 s[30:31]
25445;
25446; GFX9-LABEL: v_log10_bf16:
25447; GFX9:       ; %bb.0:
25448; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25449; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25450; GFX9-NEXT:    s_mov_b32 s4, 0x800000
25451; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25452; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
25453; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25454; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
25455; GFX9-NEXT:    v_log_f32_e32 v0, v0
25456; GFX9-NEXT:    s_mov_b32 s4, 0x3e9a209a
25457; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
25458; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
25459; GFX9-NEXT:    s_mov_b32 s4, 0x3284fbcf
25460; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
25461; GFX9-NEXT:    s_mov_b32 s4, 0x7f800000
25462; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
25463; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25464; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25465; GFX9-NEXT:    v_mov_b32_e32 v1, 0x411a209b
25466; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25467; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
25468; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
25469; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25470; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
25471; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25472; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25473; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25474; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25475; GFX9-NEXT:    s_setpc_b64 s[30:31]
25476;
25477; GFX10-LABEL: v_log10_bf16:
25478; GFX10:       ; %bb.0:
25479; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25480; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25481; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25482; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25483; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25484; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
25485; GFX10-NEXT:    v_log_f32_e32 v0, v0
25486; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
25487; GFX10-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
25488; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
25489; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
25490; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
25491; GFX10-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25492; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25493; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
25494; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
25495; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25496; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25497; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25498; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25499; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25500; GFX10-NEXT:    s_setpc_b64 s[30:31]
25501;
25502; GFX11-LABEL: v_log10_bf16:
25503; GFX11:       ; %bb.0:
25504; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25505; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25506; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
25507; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25508; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25509; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
25510; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25511; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
25512; GFX11-NEXT:    v_log_f32_e32 v0, v0
25513; GFX11-NEXT:    s_waitcnt_depctr 0xfff
25514; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
25515; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25516; GFX11-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
25517; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
25518; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25519; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
25520; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
25521; GFX11-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25522; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25523; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25524; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
25525; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
25526; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25527; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25528; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25529; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25530; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25531; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
25532; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25533; GFX11-NEXT:    s_setpc_b64 s[30:31]
25534  %op = call bfloat @llvm.log10.bf16(bfloat %a)
25535  ret bfloat %op
25536}
25537
25538declare bfloat @llvm.exp.bf16(bfloat)
25539declare bfloat @llvm.exp2.bf16(bfloat)
25540declare bfloat @llvm.exp10.bf16(bfloat)
25541
; Codegen test for llvm.exp.bf16 on a single bfloat argument.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. Only the shared GFX11 prefix appears here, so the
; same expectations apply to both gfx1100 RUN lines (+real-true16 and
; -real-true16).
; What the expectations show:
;  * GCN and GFX8 multiply by the split constants 0x3fb8a000 / 0x39a3b295 and
;    combine with mul/add; GFX7, GFX9, GFX10 and GFX11 use 0x3fb8aa3b with
;    v_fma_f32 plus the correction constant 0x32a5705f.
;  * Every target compares the input against 0xc2ce8ed0 (selecting 0) and
;    0x42b17218 (selecting 0x7f800000) after v_exp_f32 + v_ldexp_f32.
;  * GCN and GFX7 finish with an AND against 0xffff0000; GFX8 and newer finish
;    with a v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask / shift-right-16
;    sequence. NOTE(review): presumably f32->bf16 round-to-nearest-even with
;    NaN quieting -- confirm against the lowering code.
25542define bfloat @v_exp_bf16(bfloat %a) {
25543; GCN-LABEL: v_exp_bf16:
25544; GCN:       ; %bb.0:
25545; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25546; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25547; GCN-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
25548; GCN-NEXT:    s_mov_b32 s5, 0x42b17218
25549; GCN-NEXT:    v_mov_b32_e32 v1, 0x7f800000
25550; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25551; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v0
25552; GCN-NEXT:    v_sub_f32_e32 v3, v0, v0
25553; GCN-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v0
25554; GCN-NEXT:    v_rndne_f32_e32 v5, v2
25555; GCN-NEXT:    v_mul_f32_e32 v6, 0x39a3b295, v3
25556; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8a000, v3
25557; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
25558; GCN-NEXT:    v_add_f32_e32 v3, v3, v6
25559; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
25560; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
25561; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
25562; GCN-NEXT:    v_exp_f32_e32 v2, v2
25563; GCN-NEXT:    v_ldexp_f32_e32 v2, v2, v5
25564; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25565; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
25566; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s5, v0
25567; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25568; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25569; GCN-NEXT:    s_setpc_b64 s[30:31]
25570;
25571; GFX7-LABEL: v_exp_bf16:
25572; GFX7:       ; %bb.0:
25573; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25574; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25575; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25576; GFX7-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
25577; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25578; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
25579; GFX7-NEXT:    s_mov_b32 s4, 0x32a5705f
25580; GFX7-NEXT:    v_rndne_f32_e32 v3, v1
25581; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
25582; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
25583; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
25584; GFX7-NEXT:    v_exp_f32_e32 v1, v1
25585; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v3
25586; GFX7-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
25587; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25588; GFX7-NEXT:    s_mov_b32 s4, 0x42b17218
25589; GFX7-NEXT:    v_ldexp_f32_e32 v1, v1, v2
25590; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25591; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7f800000
25592; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
25593; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
25594; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25595; GFX7-NEXT:    s_setpc_b64 s[30:31]
25596;
25597; GFX8-LABEL: v_exp_bf16:
25598; GFX8:       ; %bb.0:
25599; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25600; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25601; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v0
25602; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8a000, v0
25603; GFX8-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v3
25604; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8a000, v3
25605; GFX8-NEXT:    v_rndne_f32_e32 v2, v1
25606; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
25607; GFX8-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v0
25608; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v2
25609; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
25610; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
25611; GFX8-NEXT:    v_exp_f32_e32 v1, v1
25612; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
25613; GFX8-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
25614; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25615; GFX8-NEXT:    s_mov_b32 s4, 0x42b17218
25616; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
25617; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25618; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
25619; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
25620; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
25621; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
25622; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
25623; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
25624; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25625; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25626; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25627; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25628; GFX8-NEXT:    s_setpc_b64 s[30:31]
25629;
25630; GFX9-LABEL: v_exp_bf16:
25631; GFX9:       ; %bb.0:
25632; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25633; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25634; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25635; GFX9-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
25636; GFX9-NEXT:    v_rndne_f32_e32 v2, v1
25637; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
25638; GFX9-NEXT:    v_fma_f32 v1, v0, s4, -v1
25639; GFX9-NEXT:    s_mov_b32 s4, 0x32a5705f
25640; GFX9-NEXT:    v_fma_f32 v1, v0, s4, v1
25641; GFX9-NEXT:    v_add_f32_e32 v1, v3, v1
25642; GFX9-NEXT:    v_exp_f32_e32 v1, v1
25643; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
25644; GFX9-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
25645; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25646; GFX9-NEXT:    s_mov_b32 s4, 0x42b17218
25647; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
25648; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25649; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
25650; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
25651; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
25652; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
25653; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25654; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
25655; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25656; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25657; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25658; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25659; GFX9-NEXT:    s_setpc_b64 s[30:31]
25660;
25661; GFX10-LABEL: v_exp_bf16:
25662; GFX10:       ; %bb.0:
25663; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25664; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25665; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25666; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
25667; GFX10-NEXT:    v_rndne_f32_e32 v2, v1
25668; GFX10-NEXT:    v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
25669; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v2
25670; GFX10-NEXT:    v_fmamk_f32 v3, v0, 0x32a5705f, v3
25671; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
25672; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
25673; GFX10-NEXT:    v_exp_f32_e32 v1, v1
25674; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
25675; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25676; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
25677; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25678; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
25679; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25680; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25681; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25682; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25683; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25684; GFX10-NEXT:    s_setpc_b64 s[30:31]
25685;
25686; GFX11-LABEL: v_exp_bf16:
25687; GFX11:       ; %bb.0:
25688; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25689; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25690; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25691; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25692; GFX11-NEXT:    v_rndne_f32_e32 v2, v1
25693; GFX11-NEXT:    v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
25694; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
25695; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v2
25696; GFX11-NEXT:    v_fmamk_f32 v3, v0, 0x32a5705f, v3
25697; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
25698; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
25699; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25700; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
25701; GFX11-NEXT:    v_exp_f32_e32 v1, v1
25702; GFX11-NEXT:    s_waitcnt_depctr 0xfff
25703; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
25704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
25705; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25706; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
25707; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25708; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25709; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
25710; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25711; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25712; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25714; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25715; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25716; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call whose result is returned unchanged.
25717  %op = call bfloat @llvm.exp.bf16(bfloat %a)
25718  ret bfloat %op
25719}
25720
; Codegen test for llvm.exp2.bf16 on a single bfloat argument.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. Only the shared GFX11 prefix appears here, so the
; same expectations apply to both gfx1100 RUN lines.
; What the expectations show:
;  * Every target compares the widened input against 0xc2fc0000 and, when the
;    compare is true, adds 0x42800000 before v_exp_f32 and passes ~63
;    (0xffffffc0, i.e. -64) as the v_ldexp_f32 exponent; otherwise both
;    selects yield 0. NOTE(review): 0xc2fc0000 / 0x42800000 decode to
;    -126.0 / 64.0, so this looks like range scaling to avoid a denormal
;    result -- confirm against the exp2 lowering.
;  * GCN and GFX7 finish with an AND against 0xffff0000; GFX8 and newer finish
;    with a v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask / shift-right-16
;    sequence. NOTE(review): presumably f32->bf16 round-to-nearest-even with
;    NaN quieting -- confirm.
25721define bfloat @v_exp2_bf16(bfloat %a) {
25722; GCN-LABEL: v_exp2_bf16:
25723; GCN:       ; %bb.0:
25724; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25725; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25726; GCN-NEXT:    s_mov_b32 s4, 0xc2fc0000
25727; GCN-NEXT:    v_mov_b32_e32 v1, 0x42800000
25728; GCN-NEXT:    v_not_b32_e32 v2, 63
25729; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25730; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25731; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25732; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
25733; GCN-NEXT:    v_exp_f32_e32 v0, v0
25734; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
25735; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
25736; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25737; GCN-NEXT:    s_setpc_b64 s[30:31]
25738;
25739; GFX7-LABEL: v_exp2_bf16:
25740; GFX7:       ; %bb.0:
25741; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25742; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25743; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25744; GFX7-NEXT:    s_mov_b32 s4, 0xc2fc0000
25745; GFX7-NEXT:    v_mov_b32_e32 v1, 0x42800000
25746; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25747; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25748; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
25749; GFX7-NEXT:    v_exp_f32_e32 v0, v0
25750; GFX7-NEXT:    v_not_b32_e32 v1, 63
25751; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25752; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
25753; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25754; GFX7-NEXT:    s_setpc_b64 s[30:31]
25755;
25756; GFX8-LABEL: v_exp2_bf16:
25757; GFX8:       ; %bb.0:
25758; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25759; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25760; GFX8-NEXT:    s_mov_b32 s4, 0xc2fc0000
25761; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42800000
25762; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25763; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25764; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
25765; GFX8-NEXT:    v_exp_f32_e32 v0, v0
25766; GFX8-NEXT:    v_not_b32_e32 v1, 63
25767; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25768; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
25769; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
25770; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
25771; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
25772; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25773; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25774; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25775; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25776; GFX8-NEXT:    s_setpc_b64 s[30:31]
25777;
25778; GFX9-LABEL: v_exp2_bf16:
25779; GFX9:       ; %bb.0:
25780; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25781; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25782; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
25783; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
25784; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42800000
25785; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
25786; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
25787; GFX9-NEXT:    v_exp_f32_e32 v0, v0
25788; GFX9-NEXT:    v_not_b32_e32 v1, 63
25789; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25790; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25791; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
25792; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
25793; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
25794; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25795; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25796; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25797; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25798; GFX9-NEXT:    s_setpc_b64 s[30:31]
25799;
25800; GFX10-LABEL: v_exp2_bf16:
25801; GFX10:       ; %bb.0:
25802; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25803; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25804; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
25805; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
25806; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
25807; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
25808; GFX10-NEXT:    v_exp_f32_e32 v0, v0
25809; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
25810; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
25811; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25812; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25813; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25814; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25815; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25816; GFX10-NEXT:    s_setpc_b64 s[30:31]
25817;
25818; GFX11-LABEL: v_exp2_bf16:
25819; GFX11:       ; %bb.0:
25820; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25821; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25822; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
25823; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
25824; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
25825; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
25826; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
25827; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
25828; GFX11-NEXT:    v_exp_f32_e32 v0, v0
25829; GFX11-NEXT:    s_waitcnt_depctr 0xfff
25830; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
25831; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
25832; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25833; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25834; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25835; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25836; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25837; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
25838; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25839; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call whose result is returned unchanged.
25840  %op = call bfloat @llvm.exp2.bf16(bfloat %a)
25841  ret bfloat %op
25842}
25843
; Codegen test for llvm.exp10.bf16 on a single bfloat argument.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. Only the shared GFX11 prefix appears here, so the
; same expectations apply to both gfx1100 RUN lines.
; What the expectations show (same instruction shape as the v_exp_bf16
; checks, with different constants):
;  * GCN and GFX8 multiply by the split constants 0x40549000 / 0x3a2784bc and
;    combine with mul/add; GFX7, GFX9, GFX10 and GFX11 use 0x40549a78 with
;    v_fma_f32 plus the correction constant 0x33979a37.
;  * Every target compares the input against 0xc23369f4 (selecting 0) and
;    0x421a209b (selecting 0x7f800000) after v_exp_f32 + v_ldexp_f32.
;  * GCN and GFX7 finish with an AND against 0xffff0000; GFX8 and newer finish
;    with a v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask / shift-right-16
;    sequence. NOTE(review): presumably f32->bf16 round-to-nearest-even with
;    NaN quieting -- confirm.
25844define bfloat @v_exp10_bf16(bfloat %a) {
25845; GCN-LABEL: v_exp10_bf16:
25846; GCN:       ; %bb.0:
25847; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25848; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25849; GCN-NEXT:    s_mov_b32 s4, 0xc23369f4
25850; GCN-NEXT:    s_mov_b32 s5, 0x421a209b
25851; GCN-NEXT:    v_mov_b32_e32 v1, 0x7f800000
25852; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25853; GCN-NEXT:    v_mul_f32_e32 v2, 0x40549000, v0
25854; GCN-NEXT:    v_sub_f32_e32 v3, v0, v0
25855; GCN-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v0
25856; GCN-NEXT:    v_rndne_f32_e32 v5, v2
25857; GCN-NEXT:    v_mul_f32_e32 v6, 0x3a2784bc, v3
25858; GCN-NEXT:    v_mul_f32_e32 v3, 0x40549000, v3
25859; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
25860; GCN-NEXT:    v_add_f32_e32 v3, v3, v6
25861; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
25862; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
25863; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
25864; GCN-NEXT:    v_exp_f32_e32 v2, v2
25865; GCN-NEXT:    v_ldexp_f32_e32 v2, v2, v5
25866; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25867; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
25868; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s5, v0
25869; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25870; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25871; GCN-NEXT:    s_setpc_b64 s[30:31]
25872;
25873; GFX7-LABEL: v_exp10_bf16:
25874; GFX7:       ; %bb.0:
25875; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25876; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
25877; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25878; GFX7-NEXT:    s_mov_b32 s4, 0x40549a78
25879; GFX7-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
25880; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
25881; GFX7-NEXT:    s_mov_b32 s4, 0x33979a37
25882; GFX7-NEXT:    v_rndne_f32_e32 v3, v1
25883; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
25884; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
25885; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
25886; GFX7-NEXT:    v_exp_f32_e32 v1, v1
25887; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v3
25888; GFX7-NEXT:    s_mov_b32 s4, 0xc23369f4
25889; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25890; GFX7-NEXT:    s_mov_b32 s4, 0x421a209b
25891; GFX7-NEXT:    v_ldexp_f32_e32 v1, v1, v2
25892; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25893; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7f800000
25894; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
25895; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
25896; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
25897; GFX7-NEXT:    s_setpc_b64 s[30:31]
25898;
25899; GFX8-LABEL: v_exp10_bf16:
25900; GFX8:       ; %bb.0:
25901; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25902; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25903; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v0
25904; GFX8-NEXT:    v_mul_f32_e32 v1, 0x40549000, v0
25905; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v3
25906; GFX8-NEXT:    v_mul_f32_e32 v3, 0x40549000, v3
25907; GFX8-NEXT:    v_rndne_f32_e32 v2, v1
25908; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
25909; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v0
25910; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v2
25911; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
25912; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
25913; GFX8-NEXT:    v_exp_f32_e32 v1, v1
25914; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
25915; GFX8-NEXT:    s_mov_b32 s4, 0xc23369f4
25916; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25917; GFX8-NEXT:    s_mov_b32 s4, 0x421a209b
25918; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
25919; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25920; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
25921; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
25922; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
25923; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
25924; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
25925; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
25926; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25927; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25928; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25929; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25930; GFX8-NEXT:    s_setpc_b64 s[30:31]
25931;
25932; GFX9-LABEL: v_exp10_bf16:
25933; GFX9:       ; %bb.0:
25934; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25935; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25936; GFX9-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
25937; GFX9-NEXT:    s_mov_b32 s4, 0x40549a78
25938; GFX9-NEXT:    v_rndne_f32_e32 v2, v1
25939; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
25940; GFX9-NEXT:    v_fma_f32 v1, v0, s4, -v1
25941; GFX9-NEXT:    s_mov_b32 s4, 0x33979a37
25942; GFX9-NEXT:    v_fma_f32 v1, v0, s4, v1
25943; GFX9-NEXT:    v_add_f32_e32 v1, v3, v1
25944; GFX9-NEXT:    v_exp_f32_e32 v1, v1
25945; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
25946; GFX9-NEXT:    s_mov_b32 s4, 0xc23369f4
25947; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
25948; GFX9-NEXT:    s_mov_b32 s4, 0x421a209b
25949; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
25950; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
25951; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
25952; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
25953; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
25954; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
25955; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
25956; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
25957; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25958; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
25959; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
25960; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25961; GFX9-NEXT:    s_setpc_b64 s[30:31]
25962;
25963; GFX10-LABEL: v_exp10_bf16:
25964; GFX10:       ; %bb.0:
25965; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25966; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25967; GFX10-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
25968; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
25969; GFX10-NEXT:    v_rndne_f32_e32 v2, v1
25970; GFX10-NEXT:    v_fma_f32 v3, 0x40549a78, v0, -v1
25971; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v2
25972; GFX10-NEXT:    v_fmamk_f32 v3, v0, 0x33979a37, v3
25973; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
25974; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
25975; GFX10-NEXT:    v_exp_f32_e32 v1, v1
25976; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
25977; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25978; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
25979; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25980; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
25981; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
25982; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
25983; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
25984; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25985; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
25986; GFX10-NEXT:    s_setpc_b64 s[30:31]
25987;
25988; GFX11-LABEL: v_exp10_bf16:
25989; GFX11:       ; %bb.0:
25990; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25991; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
25992; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25993; GFX11-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
25994; GFX11-NEXT:    v_rndne_f32_e32 v2, v1
25995; GFX11-NEXT:    v_fma_f32 v3, 0x40549a78, v0, -v1
25996; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
25997; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v2
25998; GFX11-NEXT:    v_fmamk_f32 v3, v0, 0x33979a37, v3
25999; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
26000; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
26001; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26002; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
26003; GFX11-NEXT:    v_exp_f32_e32 v1, v1
26004; GFX11-NEXT:    s_waitcnt_depctr 0xfff
26005; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
26006; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
26007; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
26008; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
26009; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
26010; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
26011; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26012; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26013; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26014; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26015; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26016; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26017; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26018; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call whose result is returned unchanged.
26019  %op = call bfloat @llvm.exp10.bf16(bfloat %a)
26020  ret bfloat %op
26021}
26022
26023declare bfloat @llvm.ceil.bf16(bfloat)
26024
; Codegen test for llvm.ceil.bf16 on a single bfloat argument.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. Only the shared GFX11 prefix appears here, so the
; same expectations apply to both gfx1100 RUN lines.
; What the expectations show:
;  * Every target performs the operation with a single v_ceil_f32_e32 on the
;    value widened into the upper 16 bits of a 32-bit register.
;  * GCN and GFX7 widen with v_mul_f32 by 1.0 plus an AND against 0xffff0000
;    and narrow with the same AND; GFX8 and newer widen with a left shift by
;    16 and narrow with a v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask /
;    shift-right-16 sequence. NOTE(review): presumably f32->bf16
;    round-to-nearest-even with NaN quieting -- confirm.
26025define bfloat @v_ceil_bf16(bfloat %a) {
26026; GCN-LABEL: v_ceil_bf16:
26027; GCN:       ; %bb.0:
26028; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26029; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26030; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26031; GCN-NEXT:    v_ceil_f32_e32 v0, v0
26032; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26033; GCN-NEXT:    s_setpc_b64 s[30:31]
26034;
26035; GFX7-LABEL: v_ceil_bf16:
26036; GFX7:       ; %bb.0:
26037; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26038; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26039; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26040; GFX7-NEXT:    v_ceil_f32_e32 v0, v0
26041; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26042; GFX7-NEXT:    s_setpc_b64 s[30:31]
26043;
26044; GFX8-LABEL: v_ceil_bf16:
26045; GFX8:       ; %bb.0:
26046; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26047; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26048; GFX8-NEXT:    v_ceil_f32_e32 v0, v0
26049; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26050; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26051; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26052; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26053; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26054; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26055; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26056; GFX8-NEXT:    s_setpc_b64 s[30:31]
26057;
26058; GFX9-LABEL: v_ceil_bf16:
26059; GFX9:       ; %bb.0:
26060; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26061; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26062; GFX9-NEXT:    v_ceil_f32_e32 v0, v0
26063; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26064; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26065; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26066; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26067; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26068; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26069; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26070; GFX9-NEXT:    s_setpc_b64 s[30:31]
26071;
26072; GFX10-LABEL: v_ceil_bf16:
26073; GFX10:       ; %bb.0:
26074; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26075; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26076; GFX10-NEXT:    v_ceil_f32_e32 v0, v0
26077; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26078; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26079; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26080; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26081; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26082; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26083; GFX10-NEXT:    s_setpc_b64 s[30:31]
26084;
26085; GFX11-LABEL: v_ceil_bf16:
26086; GFX11:       ; %bb.0:
26087; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26088; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26089; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26090; GFX11-NEXT:    v_ceil_f32_e32 v0, v0
26091; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26092; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26093; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26094; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26095; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26096; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26097; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26098; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26099; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call whose result is returned unchanged.
26100  %op = call bfloat @llvm.ceil.bf16(bfloat %a)
26101  ret bfloat %op
26102}
26103
26104declare bfloat @llvm.trunc.bf16(bfloat)
26105
; Codegen test for llvm.trunc.bf16 on a single bfloat argument.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. Only the shared GFX11 prefix appears here, so the
; same expectations apply to both gfx1100 RUN lines.
; What the expectations show:
;  * Every target performs the operation with a single v_trunc_f32_e32 on the
;    value widened into the upper 16 bits of a 32-bit register.
;  * GCN and GFX7 widen with v_mul_f32 by 1.0 plus an AND against 0xffff0000
;    and narrow with the same AND; GFX8 and newer widen with a left shift by
;    16 and narrow with a v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask /
;    shift-right-16 sequence. NOTE(review): presumably f32->bf16
;    round-to-nearest-even with NaN quieting -- confirm.
26106define bfloat @v_trunc_bf16(bfloat %a) {
26107; GCN-LABEL: v_trunc_bf16:
26108; GCN:       ; %bb.0:
26109; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26110; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26111; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26112; GCN-NEXT:    v_trunc_f32_e32 v0, v0
26113; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26114; GCN-NEXT:    s_setpc_b64 s[30:31]
26115;
26116; GFX7-LABEL: v_trunc_bf16:
26117; GFX7:       ; %bb.0:
26118; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26119; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26120; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26121; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
26122; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26123; GFX7-NEXT:    s_setpc_b64 s[30:31]
26124;
26125; GFX8-LABEL: v_trunc_bf16:
26126; GFX8:       ; %bb.0:
26127; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26128; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26129; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
26130; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26131; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26132; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26133; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26134; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26135; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26136; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26137; GFX8-NEXT:    s_setpc_b64 s[30:31]
26138;
26139; GFX9-LABEL: v_trunc_bf16:
26140; GFX9:       ; %bb.0:
26141; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26142; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26143; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
26144; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26145; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26146; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26147; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26148; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26149; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26150; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26151; GFX9-NEXT:    s_setpc_b64 s[30:31]
26152;
26153; GFX10-LABEL: v_trunc_bf16:
26154; GFX10:       ; %bb.0:
26155; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26156; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26157; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
26158; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26159; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26160; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26161; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26162; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26163; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26164; GFX10-NEXT:    s_setpc_b64 s[30:31]
26165;
26166; GFX11-LABEL: v_trunc_bf16:
26167; GFX11:       ; %bb.0:
26168; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26169; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26171; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
26172; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26173; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26174; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26175; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26176; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26177; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26178; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26179; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26180; GFX11-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one intrinsic call whose result is returned unchanged.
26181  %op = call bfloat @llvm.trunc.bf16(bfloat %a)
26182  ret bfloat %op
26183}
26184
26185declare bfloat @llvm.rint.bf16(bfloat)
26186
; Lowering of llvm.rint on a scalar bfloat argument, checked for every llc
; configuration in this file. Per the expected output below, the GCN and GFX7
; runs multiply the input by 1.0, mask it with 0xffff0000, apply v_rndne_f32
; and mask the result again. GFX8 and later shift the input left by 16, apply
; v_rndne_f32, and then narrow the f32 result: v_bfe_u32 extracts bit 16, that
; bit plus 0x7fff is added to the value, v_cndmask_b32 instead picks the value
; OR'd with 0x400000 when v_cmp_u_f32 reports the result unordered with itself,
; and the final value is shifted right by 16.
; NOTE(review): the narrowing looks like round-to-nearest-even with NaN
; quieting; confirm against the bf16 lowering code.
26187define bfloat @v_rint_bf16(bfloat %a) {
26188; GCN-LABEL: v_rint_bf16:
26189; GCN:       ; %bb.0:
26190; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26191; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26192; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26193; GCN-NEXT:    v_rndne_f32_e32 v0, v0
26194; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26195; GCN-NEXT:    s_setpc_b64 s[30:31]
26196;
26197; GFX7-LABEL: v_rint_bf16:
26198; GFX7:       ; %bb.0:
26199; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26200; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26201; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26202; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
26203; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26204; GFX7-NEXT:    s_setpc_b64 s[30:31]
26205;
26206; GFX8-LABEL: v_rint_bf16:
26207; GFX8:       ; %bb.0:
26208; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26209; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26210; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
26211; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26212; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26213; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26214; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26215; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26216; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26217; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26218; GFX8-NEXT:    s_setpc_b64 s[30:31]
26219;
26220; GFX9-LABEL: v_rint_bf16:
26221; GFX9:       ; %bb.0:
26222; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26223; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26224; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
26225; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26226; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26227; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26228; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26229; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26230; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26231; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26232; GFX9-NEXT:    s_setpc_b64 s[30:31]
26233;
26234; GFX10-LABEL: v_rint_bf16:
26235; GFX10:       ; %bb.0:
26236; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26237; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26238; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
26239; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26240; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26241; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26242; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26243; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26244; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26245; GFX10-NEXT:    s_setpc_b64 s[30:31]
26246;
26247; GFX11-LABEL: v_rint_bf16:
26248; GFX11:       ; %bb.0:
26249; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26250; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26251; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26252; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
26253; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26254; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26255; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26256; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26257; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26258; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26259; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26260; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26261; GFX11-NEXT:    s_setpc_b64 s[30:31]
26262  %op = call bfloat @llvm.rint.bf16(bfloat %a)
26263  ret bfloat %op
26264}
26265
26266declare bfloat @llvm.nearbyint.bf16(bfloat)
26267
; Lowering of llvm.nearbyint on a scalar bfloat argument. The expected output
; uses v_rndne_f32 as the only rounding instruction in every configuration.
; The GCN and GFX7 runs multiply by 1.0 and mask with 0xffff0000 before the
; rounding and mask again afterwards. GFX8 and later shift the input left by
; 16, round, and then narrow the f32 result with the v_bfe_u32 / add 0x7fff /
; v_cndmask_b32 sequence (the v_cndmask_b32 picks the value OR'd with 0x400000
; when v_cmp_u_f32 reports the result unordered with itself) before shifting
; right by 16.
26268define bfloat @v_nearbyint_bf16(bfloat %a) {
26269; GCN-LABEL: v_nearbyint_bf16:
26270; GCN:       ; %bb.0:
26271; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26272; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26273; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26274; GCN-NEXT:    v_rndne_f32_e32 v0, v0
26275; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26276; GCN-NEXT:    s_setpc_b64 s[30:31]
26277;
26278; GFX7-LABEL: v_nearbyint_bf16:
26279; GFX7:       ; %bb.0:
26280; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26281; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26282; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26283; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
26284; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26285; GFX7-NEXT:    s_setpc_b64 s[30:31]
26286;
26287; GFX8-LABEL: v_nearbyint_bf16:
26288; GFX8:       ; %bb.0:
26289; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26290; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26291; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
26292; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26293; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26294; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26295; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26296; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26297; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26298; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26299; GFX8-NEXT:    s_setpc_b64 s[30:31]
26300;
26301; GFX9-LABEL: v_nearbyint_bf16:
26302; GFX9:       ; %bb.0:
26303; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26304; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26305; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
26306; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26307; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26308; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26309; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26310; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26311; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26312; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26313; GFX9-NEXT:    s_setpc_b64 s[30:31]
26314;
26315; GFX10-LABEL: v_nearbyint_bf16:
26316; GFX10:       ; %bb.0:
26317; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26318; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26319; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
26320; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26321; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26322; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26323; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26324; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26325; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26326; GFX10-NEXT:    s_setpc_b64 s[30:31]
26327;
26328; GFX11-LABEL: v_nearbyint_bf16:
26329; GFX11:       ; %bb.0:
26330; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26331; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26332; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26333; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
26334; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26335; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26336; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26337; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26338; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26339; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26340; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26341; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26342; GFX11-NEXT:    s_setpc_b64 s[30:31]
26343  %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
26344  ret bfloat %op
26345}
26346
26347declare bfloat @llvm.round.bf16(bfloat)
26348
; Lowering of llvm.round on a scalar bfloat argument. No single rounding
; instruction appears in the expected output; every configuration instead
; expects this multi-instruction expansion on the f32 value:
;   1. v_trunc_f32 of the input,
;   2. v_sub_f32 of the input and the truncated value,
;   3. v_cmp_ge_f32 of |difference| against 0.5, selecting 0 or 1.0,
;   4. v_bfi_b32 with mask 0x7fffffff, which (per the v_bfi_b32 definition)
;      takes the low 31 bits from that 0/1.0 value and the top bit from the
;      input,
;   5. v_add_f32 of the truncated value and the result of step 4.
; The mask is materialised with "s_brev_b32 s4, -2" on the targets up to GFX9
; and is an inline literal on GFX10 and GFX11.
; The GCN and GFX7 runs mask input and result with 0xffff0000. GFX8 and later
; shift the input left by 16 and narrow the result with the v_bfe_u32 /
; add 0x7fff / v_cndmask_b32 sequence before shifting right by 16.
26349define bfloat @v_round_bf16(bfloat %a) {
26350; GCN-LABEL: v_round_bf16:
26351; GCN:       ; %bb.0:
26352; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26353; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26354; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26355; GCN-NEXT:    v_trunc_f32_e32 v1, v0
26356; GCN-NEXT:    v_sub_f32_e32 v2, v0, v1
26357; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26358; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26359; GCN-NEXT:    s_brev_b32 s4, -2
26360; GCN-NEXT:    v_bfi_b32 v0, s4, v2, v0
26361; GCN-NEXT:    v_add_f32_e32 v0, v1, v0
26362; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26363; GCN-NEXT:    s_setpc_b64 s[30:31]
26364;
26365; GFX7-LABEL: v_round_bf16:
26366; GFX7:       ; %bb.0:
26367; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26368; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26369; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26370; GFX7-NEXT:    v_trunc_f32_e32 v1, v0
26371; GFX7-NEXT:    v_sub_f32_e32 v2, v0, v1
26372; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26373; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26374; GFX7-NEXT:    s_brev_b32 s4, -2
26375; GFX7-NEXT:    v_bfi_b32 v0, s4, v2, v0
26376; GFX7-NEXT:    v_add_f32_e32 v0, v1, v0
26377; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26378; GFX7-NEXT:    s_setpc_b64 s[30:31]
26379;
26380; GFX8-LABEL: v_round_bf16:
26381; GFX8:       ; %bb.0:
26382; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26383; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26384; GFX8-NEXT:    v_trunc_f32_e32 v1, v0
26385; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
26386; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26387; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26388; GFX8-NEXT:    s_brev_b32 s4, -2
26389; GFX8-NEXT:    v_bfi_b32 v0, s4, v2, v0
26390; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
26391; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26392; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26393; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26394; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26395; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26396; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26397; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26398; GFX8-NEXT:    s_setpc_b64 s[30:31]
26399;
26400; GFX9-LABEL: v_round_bf16:
26401; GFX9:       ; %bb.0:
26402; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26403; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26404; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
26405; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
26406; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26407; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26408; GFX9-NEXT:    s_brev_b32 s4, -2
26409; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
26410; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
26411; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26412; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26413; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26414; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26415; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26416; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26417; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26418; GFX9-NEXT:    s_setpc_b64 s[30:31]
26419;
26420; GFX10-LABEL: v_round_bf16:
26421; GFX10:       ; %bb.0:
26422; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26423; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26424; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
26425; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
26426; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
26427; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
26428; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
26429; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
26430; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26431; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26432; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26433; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26434; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26435; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26436; GFX10-NEXT:    s_setpc_b64 s[30:31]
26437;
26438; GFX11-LABEL: v_round_bf16:
26439; GFX11:       ; %bb.0:
26440; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26441; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26442; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26443; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
26444; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
26445; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26446; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
26447; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
26448; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26449; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
26450; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
26451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
26452; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26453; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26454; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26455; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26456; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26457; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26458; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26459; GFX11-NEXT:    s_setpc_b64 s[30:31]
26460  %op = call bfloat @llvm.round.bf16(bfloat %a)
26461  ret bfloat %op
26462}
26463
26464declare bfloat @llvm.roundeven.bf16(bfloat)
26465
; Lowering of llvm.roundeven on a scalar bfloat argument. Every configuration
; expects a single v_rndne_f32 as the rounding step. The GCN and GFX7 runs
; multiply by 1.0 and mask with 0xffff0000 before it and mask the result
; afterwards. GFX8 and later shift the input left by 16, round, and narrow the
; f32 result with the v_bfe_u32 / add 0x7fff / v_cndmask_b32 sequence (the
; v_cndmask_b32 picks the value OR'd with 0x400000 when v_cmp_u_f32 reports
; the result unordered with itself) before shifting right by 16.
26466define bfloat @v_roundeven_bf16(bfloat %a) {
26467; GCN-LABEL: v_roundeven_bf16:
26468; GCN:       ; %bb.0:
26469; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26470; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26471; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26472; GCN-NEXT:    v_rndne_f32_e32 v0, v0
26473; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26474; GCN-NEXT:    s_setpc_b64 s[30:31]
26475;
26476; GFX7-LABEL: v_roundeven_bf16:
26477; GFX7:       ; %bb.0:
26478; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26479; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26480; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26481; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
26482; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26483; GFX7-NEXT:    s_setpc_b64 s[30:31]
26484;
26485; GFX8-LABEL: v_roundeven_bf16:
26486; GFX8:       ; %bb.0:
26487; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26488; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26489; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
26490; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26491; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26492; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26493; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26494; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26495; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26496; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26497; GFX8-NEXT:    s_setpc_b64 s[30:31]
26498;
26499; GFX9-LABEL: v_roundeven_bf16:
26500; GFX9:       ; %bb.0:
26501; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26502; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26503; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
26504; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26505; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26506; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26507; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26508; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26509; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26510; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26511; GFX9-NEXT:    s_setpc_b64 s[30:31]
26512;
26513; GFX10-LABEL: v_roundeven_bf16:
26514; GFX10:       ; %bb.0:
26515; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26516; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26517; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
26518; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26519; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26520; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26521; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26522; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26523; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26524; GFX10-NEXT:    s_setpc_b64 s[30:31]
26525;
26526; GFX11-LABEL: v_roundeven_bf16:
26527; GFX11:       ; %bb.0:
26528; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26529; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26530; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26531; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
26532; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26533; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26534; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26535; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26536; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26537; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26538; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26539; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26540; GFX11-NEXT:    s_setpc_b64 s[30:31]
26541  %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
26542  ret bfloat %op
26543}
26544
26545declare bfloat @llvm.floor.bf16(bfloat)
26546
; Lowering of llvm.floor on a scalar bfloat argument. Every configuration
; expects a single v_floor_f32 on the f32 form of the input. The GCN and GFX7
; runs multiply by 1.0 and mask with 0xffff0000 before it and mask the result
; afterwards. GFX8 and later shift the input left by 16, apply v_floor_f32,
; and narrow the f32 result with the v_bfe_u32 / add 0x7fff / v_cndmask_b32
; sequence (the v_cndmask_b32 picks the value OR'd with 0x400000 when
; v_cmp_u_f32 reports the result unordered with itself) before shifting right
; by 16.
26547define bfloat @v_floor_bf16(bfloat %a) {
26548; GCN-LABEL: v_floor_bf16:
26549; GCN:       ; %bb.0:
26550; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26551; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26552; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26553; GCN-NEXT:    v_floor_f32_e32 v0, v0
26554; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26555; GCN-NEXT:    s_setpc_b64 s[30:31]
26556;
26557; GFX7-LABEL: v_floor_bf16:
26558; GFX7:       ; %bb.0:
26559; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26560; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26561; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26562; GFX7-NEXT:    v_floor_f32_e32 v0, v0
26563; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26564; GFX7-NEXT:    s_setpc_b64 s[30:31]
26565;
26566; GFX8-LABEL: v_floor_bf16:
26567; GFX8:       ; %bb.0:
26568; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26569; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26570; GFX8-NEXT:    v_floor_f32_e32 v0, v0
26571; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26572; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26573; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26574; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26575; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26576; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26577; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26578; GFX8-NEXT:    s_setpc_b64 s[30:31]
26579;
26580; GFX9-LABEL: v_floor_bf16:
26581; GFX9:       ; %bb.0:
26582; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26583; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26584; GFX9-NEXT:    v_floor_f32_e32 v0, v0
26585; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26586; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26587; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26588; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26589; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26590; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26591; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26592; GFX9-NEXT:    s_setpc_b64 s[30:31]
26593;
26594; GFX10-LABEL: v_floor_bf16:
26595; GFX10:       ; %bb.0:
26596; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26597; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26598; GFX10-NEXT:    v_floor_f32_e32 v0, v0
26599; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26600; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26601; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26602; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26603; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26604; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26605; GFX10-NEXT:    s_setpc_b64 s[30:31]
26606;
26607; GFX11-LABEL: v_floor_bf16:
26608; GFX11:       ; %bb.0:
26609; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26610; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26611; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26612; GFX11-NEXT:    v_floor_f32_e32 v0, v0
26613; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26614; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26615; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26616; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26617; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26618; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26619; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26620; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26621; GFX11-NEXT:    s_setpc_b64 s[30:31]
26622  %op = call bfloat @llvm.floor.bf16(bfloat %a)
26623  ret bfloat %op
26624}
26625
26626declare bfloat @llvm.canonicalize.bf16(bfloat)
26627
; Lowering of llvm.canonicalize on a scalar bfloat argument. The instruction
; expected for the canonicalize itself differs per configuration:
;   - the GCN and GFX7 runs expect only v_mul_f32 by 1.0 followed by two masks
;     with 0xffff0000, with no further instruction in between;
;   - the GFX8 run shifts the input left by 16 and expects v_mul_f32 by 1.0;
;   - the GFX9, GFX10 and GFX11 runs shift left by 16 and expect
;     v_max_f32 v0, v0, v0.
; From GFX8 onwards the f32 result is then narrowed with the v_bfe_u32 /
; add 0x7fff / v_cndmask_b32 sequence (the v_cndmask_b32 picks the value OR'd
; with 0x400000 when v_cmp_u_f32 reports the result unordered with itself) and
; shifted right by 16.
26628define bfloat @v_canonicalize_bf16(bfloat %a) {
26629; GCN-LABEL: v_canonicalize_bf16:
26630; GCN:       ; %bb.0:
26631; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26632; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26633; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26634; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26635; GCN-NEXT:    s_setpc_b64 s[30:31]
26636;
26637; GFX7-LABEL: v_canonicalize_bf16:
26638; GFX7:       ; %bb.0:
26639; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26640; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26641; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26642; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26643; GFX7-NEXT:    s_setpc_b64 s[30:31]
26644;
26645; GFX8-LABEL: v_canonicalize_bf16:
26646; GFX8:       ; %bb.0:
26647; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26648; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26649; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26650; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
26651; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
26652; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
26653; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26654; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26655; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26656; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26657; GFX8-NEXT:    s_setpc_b64 s[30:31]
26658;
26659; GFX9-LABEL: v_canonicalize_bf16:
26660; GFX9:       ; %bb.0:
26661; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26662; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26663; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
26664; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
26665; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
26666; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
26667; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26668; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
26669; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
26670; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26671; GFX9-NEXT:    s_setpc_b64 s[30:31]
26672;
26673; GFX10-LABEL: v_canonicalize_bf16:
26674; GFX10:       ; %bb.0:
26675; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26676; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26677; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
26678; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
26679; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26680; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26681; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26682; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26683; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26684; GFX10-NEXT:    s_setpc_b64 s[30:31]
26685;
26686; GFX11-LABEL: v_canonicalize_bf16:
26687; GFX11:       ; %bb.0:
26688; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26689; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26690; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26691; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
26692; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
26693; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
26694; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
26695; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26696; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
26697; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26698; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26699; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
26700; GFX11-NEXT:    s_setpc_b64 s[30:31]
26701  %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
26702  ret bfloat %op
26703}
26704
26705declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
26706
26707; FIXME: Promotion broken, so the llvm.arithmetic.fence.bf16 test below is left commented out.
26708; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
26709;   %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
26710;   ret bfloat %op
26711; }
26712
; "fcmp false" on two bfloat arguments, returning i1. Every configuration
; expects the function body to be just a v_mov_b32 of 0 into v0 followed by
; the return; no conversion of the inputs and no compare instruction is
; expected.
26713define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
26714; GCN-LABEL: v_fcmp_false_bf16:
26715; GCN:       ; %bb.0:
26716; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26717; GCN-NEXT:    v_mov_b32_e32 v0, 0
26718; GCN-NEXT:    s_setpc_b64 s[30:31]
26719;
26720; GFX7-LABEL: v_fcmp_false_bf16:
26721; GFX7:       ; %bb.0:
26722; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26723; GFX7-NEXT:    v_mov_b32_e32 v0, 0
26724; GFX7-NEXT:    s_setpc_b64 s[30:31]
26725;
26726; GFX8-LABEL: v_fcmp_false_bf16:
26727; GFX8:       ; %bb.0:
26728; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26729; GFX8-NEXT:    v_mov_b32_e32 v0, 0
26730; GFX8-NEXT:    s_setpc_b64 s[30:31]
26731;
26732; GFX9-LABEL: v_fcmp_false_bf16:
26733; GFX9:       ; %bb.0:
26734; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26735; GFX9-NEXT:    v_mov_b32_e32 v0, 0
26736; GFX9-NEXT:    s_setpc_b64 s[30:31]
26737;
26738; GFX10-LABEL: v_fcmp_false_bf16:
26739; GFX10:       ; %bb.0:
26740; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26741; GFX10-NEXT:    v_mov_b32_e32 v0, 0
26742; GFX10-NEXT:    s_setpc_b64 s[30:31]
26743;
26744; GFX11-LABEL: v_fcmp_false_bf16:
26745; GFX11:       ; %bb.0:
26746; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26747; GFX11-NEXT:    v_mov_b32_e32 v0, 0
26748; GFX11-NEXT:    s_setpc_b64 s[30:31]
26749  %op = fcmp false bfloat %a, %b
26750  ret i1 %op
26751}
26752
; "fcmp oeq" on two bfloat arguments, returning i1. The GCN and GFX7 runs
; multiply both inputs by 1.0 and mask them with 0xffff0000; GFX8 and later
; shift both inputs left by 16. Every configuration then expects a
; v_cmp_eq_f32 of the two f32 values and a v_cndmask_b32 that writes 0 or 1
; to v0. The condition register is vcc up to GFX9 and vcc_lo on GFX10 and
; GFX11.
26753define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
26754; GCN-LABEL: v_fcmp_oeq_bf16:
26755; GCN:       ; %bb.0:
26756; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26757; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26758; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26759; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26760; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26761; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
26762; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26763; GCN-NEXT:    s_setpc_b64 s[30:31]
26764;
26765; GFX7-LABEL: v_fcmp_oeq_bf16:
26766; GFX7:       ; %bb.0:
26767; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26768; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26769; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26770; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26771; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26772; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
26773; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26774; GFX7-NEXT:    s_setpc_b64 s[30:31]
26775;
26776; GFX8-LABEL: v_fcmp_oeq_bf16:
26777; GFX8:       ; %bb.0:
26778; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26779; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26780; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26781; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
26782; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26783; GFX8-NEXT:    s_setpc_b64 s[30:31]
26784;
26785; GFX9-LABEL: v_fcmp_oeq_bf16:
26786; GFX9:       ; %bb.0:
26787; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26788; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26789; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26790; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
26791; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26792; GFX9-NEXT:    s_setpc_b64 s[30:31]
26793;
26794; GFX10-LABEL: v_fcmp_oeq_bf16:
26795; GFX10:       ; %bb.0:
26796; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26797; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26798; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26799; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
26800; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26801; GFX10-NEXT:    s_setpc_b64 s[30:31]
26802;
26803; GFX11-LABEL: v_fcmp_oeq_bf16:
26804; GFX11:       ; %bb.0:
26805; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26806; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26807; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26808; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26809; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
26810; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26811; GFX11-NEXT:    s_setpc_b64 s[30:31]
26812  %op = fcmp oeq bfloat %a, %b
26813  ret i1 %op
26814}
26815
; "fcmp ogt" on two bfloat arguments, returning i1. The GCN and GFX7 runs
; multiply both inputs by 1.0 and mask them with 0xffff0000; GFX8 and later
; shift both inputs left by 16. Every configuration then expects a
; v_cmp_gt_f32 with %a's register (v0) as the first source operand and a
; v_cndmask_b32 that writes 0 or 1 to v0. The condition register is vcc up to
; GFX9 and vcc_lo on GFX10 and GFX11.
26816define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
26817; GCN-LABEL: v_fcmp_ogt_bf16:
26818; GCN:       ; %bb.0:
26819; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26820; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26821; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26822; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26823; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26824; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
26825; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26826; GCN-NEXT:    s_setpc_b64 s[30:31]
26827;
26828; GFX7-LABEL: v_fcmp_ogt_bf16:
26829; GFX7:       ; %bb.0:
26830; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26831; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26832; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26833; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26834; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26835; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
26836; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26837; GFX7-NEXT:    s_setpc_b64 s[30:31]
26838;
26839; GFX8-LABEL: v_fcmp_ogt_bf16:
26840; GFX8:       ; %bb.0:
26841; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26842; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26843; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26844; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
26845; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26846; GFX8-NEXT:    s_setpc_b64 s[30:31]
26847;
26848; GFX9-LABEL: v_fcmp_ogt_bf16:
26849; GFX9:       ; %bb.0:
26850; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26851; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26852; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26853; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
26854; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26855; GFX9-NEXT:    s_setpc_b64 s[30:31]
26856;
26857; GFX10-LABEL: v_fcmp_ogt_bf16:
26858; GFX10:       ; %bb.0:
26859; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26860; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26861; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26862; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
26863; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26864; GFX10-NEXT:    s_setpc_b64 s[30:31]
26865;
26866; GFX11-LABEL: v_fcmp_ogt_bf16:
26867; GFX11:       ; %bb.0:
26868; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26869; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26870; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26871; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26872; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
26873; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26874; GFX11-NEXT:    s_setpc_b64 s[30:31]
26875  %op = fcmp ogt bfloat %a, %b
26876  ret i1 %op
26877}
26878
; "fcmp oge" on two bfloat arguments, returning i1. The GCN and GFX7 runs
; multiply both inputs by 1.0 and mask them with 0xffff0000; GFX8 and later
; shift both inputs left by 16. Every configuration then expects a
; v_cmp_ge_f32 with %a's register (v0) as the first source operand and a
; v_cndmask_b32 that writes 0 or 1 to v0. The condition register is vcc up to
; GFX9 and vcc_lo on GFX10 and GFX11.
26879define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
26880; GCN-LABEL: v_fcmp_oge_bf16:
26881; GCN:       ; %bb.0:
26882; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26883; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26884; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26885; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26886; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26887; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
26888; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26889; GCN-NEXT:    s_setpc_b64 s[30:31]
26890;
26891; GFX7-LABEL: v_fcmp_oge_bf16:
26892; GFX7:       ; %bb.0:
26893; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26894; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26895; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26896; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26897; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26898; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
26899; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26900; GFX7-NEXT:    s_setpc_b64 s[30:31]
26901;
26902; GFX8-LABEL: v_fcmp_oge_bf16:
26903; GFX8:       ; %bb.0:
26904; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26905; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26906; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26907; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
26908; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26909; GFX8-NEXT:    s_setpc_b64 s[30:31]
26910;
26911; GFX9-LABEL: v_fcmp_oge_bf16:
26912; GFX9:       ; %bb.0:
26913; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26914; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26915; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26916; GFX9-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
26917; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26918; GFX9-NEXT:    s_setpc_b64 s[30:31]
26919;
26920; GFX10-LABEL: v_fcmp_oge_bf16:
26921; GFX10:       ; %bb.0:
26922; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26923; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26924; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26925; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
26926; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26927; GFX10-NEXT:    s_setpc_b64 s[30:31]
26928;
26929; GFX11-LABEL: v_fcmp_oge_bf16:
26930; GFX11:       ; %bb.0:
26931; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26932; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26933; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26934; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26935; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
26936; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26937; GFX11-NEXT:    s_setpc_b64 s[30:31]
26938  %op = fcmp oge bfloat %a, %b
26939  ret i1 %op
26940}
26941
; Ordered less-than: `fcmp olt` on two bfloat arguments, result returned as i1.
; Every check block below expects v_cmp_lt_f32 followed by a v_cndmask that
; materializes 0/1 in v0. The first two check blocks prepare the operands with
; v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left shift
; by 16 instead.
26942define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
26943; GCN-LABEL: v_fcmp_olt_bf16:
26944; GCN:       ; %bb.0:
26945; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26946; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26947; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26948; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26949; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26950; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
26951; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26952; GCN-NEXT:    s_setpc_b64 s[30:31]
26953;
26954; GFX7-LABEL: v_fcmp_olt_bf16:
26955; GFX7:       ; %bb.0:
26956; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26957; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
26958; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
26959; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
26960; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
26961; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
26962; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26963; GFX7-NEXT:    s_setpc_b64 s[30:31]
26964;
26965; GFX8-LABEL: v_fcmp_olt_bf16:
26966; GFX8:       ; %bb.0:
26967; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26968; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26969; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26970; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
26971; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26972; GFX8-NEXT:    s_setpc_b64 s[30:31]
26973;
26974; GFX9-LABEL: v_fcmp_olt_bf16:
26975; GFX9:       ; %bb.0:
26976; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26977; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26978; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26979; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
26980; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
26981; GFX9-NEXT:    s_setpc_b64 s[30:31]
26982;
26983; GFX10-LABEL: v_fcmp_olt_bf16:
26984; GFX10:       ; %bb.0:
26985; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26986; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26987; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26988; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
26989; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26990; GFX10-NEXT:    s_setpc_b64 s[30:31]
26991;
26992; GFX11-LABEL: v_fcmp_olt_bf16:
26993; GFX11:       ; %bb.0:
26994; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26995; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
26996; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
26997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
26998; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
26999; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27000; GFX11-NEXT:    s_setpc_b64 s[30:31]
27001  %op = fcmp olt bfloat %a, %b
27002  ret i1 %op
27003}
27004
; Ordered less-or-equal: `fcmp ole` on two bfloat arguments, result returned
; as i1. Every check block below expects v_cmp_le_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27005define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
27006; GCN-LABEL: v_fcmp_ole_bf16:
27007; GCN:       ; %bb.0:
27008; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27009; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27010; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27011; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27012; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27013; GCN-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
27014; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27015; GCN-NEXT:    s_setpc_b64 s[30:31]
27016;
27017; GFX7-LABEL: v_fcmp_ole_bf16:
27018; GFX7:       ; %bb.0:
27019; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27020; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27021; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27022; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27023; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27024; GFX7-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
27025; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27026; GFX7-NEXT:    s_setpc_b64 s[30:31]
27027;
27028; GFX8-LABEL: v_fcmp_ole_bf16:
27029; GFX8:       ; %bb.0:
27030; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27031; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27032; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27033; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
27034; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27035; GFX8-NEXT:    s_setpc_b64 s[30:31]
27036;
27037; GFX9-LABEL: v_fcmp_ole_bf16:
27038; GFX9:       ; %bb.0:
27039; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27040; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27041; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27042; GFX9-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
27043; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27044; GFX9-NEXT:    s_setpc_b64 s[30:31]
27045;
27046; GFX10-LABEL: v_fcmp_ole_bf16:
27047; GFX10:       ; %bb.0:
27048; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27049; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27050; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27051; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
27052; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27053; GFX10-NEXT:    s_setpc_b64 s[30:31]
27054;
27055; GFX11-LABEL: v_fcmp_ole_bf16:
27056; GFX11:       ; %bb.0:
27057; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27058; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27059; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27060; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27061; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
27062; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27063; GFX11-NEXT:    s_setpc_b64 s[30:31]
27064  %op = fcmp ole bfloat %a, %b
27065  ret i1 %op
27066}
27067
; Ordered not-equal: `fcmp one` on two bfloat arguments, result returned as
; i1. Every check block below expects v_cmp_lg_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27068define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
27069; GCN-LABEL: v_fcmp_one_bf16:
27070; GCN:       ; %bb.0:
27071; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27072; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27073; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27074; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27075; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27076; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
27077; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27078; GCN-NEXT:    s_setpc_b64 s[30:31]
27079;
27080; GFX7-LABEL: v_fcmp_one_bf16:
27081; GFX7:       ; %bb.0:
27082; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27083; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27084; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27085; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27086; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27087; GFX7-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
27088; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27089; GFX7-NEXT:    s_setpc_b64 s[30:31]
27090;
27091; GFX8-LABEL: v_fcmp_one_bf16:
27092; GFX8:       ; %bb.0:
27093; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27094; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27095; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27096; GFX8-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
27097; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27098; GFX8-NEXT:    s_setpc_b64 s[30:31]
27099;
27100; GFX9-LABEL: v_fcmp_one_bf16:
27101; GFX9:       ; %bb.0:
27102; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27103; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27104; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27105; GFX9-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
27106; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27107; GFX9-NEXT:    s_setpc_b64 s[30:31]
27108;
27109; GFX10-LABEL: v_fcmp_one_bf16:
27110; GFX10:       ; %bb.0:
27111; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27112; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27113; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27114; GFX10-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
27115; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27116; GFX10-NEXT:    s_setpc_b64 s[30:31]
27117;
27118; GFX11-LABEL: v_fcmp_one_bf16:
27119; GFX11:       ; %bb.0:
27120; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27121; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27122; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27123; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27124; GFX11-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
27125; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27126; GFX11-NEXT:    s_setpc_b64 s[30:31]
27127  %op = fcmp one bfloat %a, %b
27128  ret i1 %op
27129}
27130
; Unordered test: `fcmp uno` on two bfloat arguments, result returned as i1.
; Every check block below expects v_cmp_u_f32 followed by a v_cndmask that
; materializes 0/1 in v0. The first two check blocks prepare the operands with
; v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left shift
; by 16 instead.
27131define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
27132; GCN-LABEL: v_fcmp_uno_bf16:
27133; GCN:       ; %bb.0:
27134; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27135; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27136; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27137; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27138; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27139; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
27140; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27141; GCN-NEXT:    s_setpc_b64 s[30:31]
27142;
27143; GFX7-LABEL: v_fcmp_uno_bf16:
27144; GFX7:       ; %bb.0:
27145; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27146; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27147; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27148; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27149; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27150; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
27151; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27152; GFX7-NEXT:    s_setpc_b64 s[30:31]
27153;
27154; GFX8-LABEL: v_fcmp_uno_bf16:
27155; GFX8:       ; %bb.0:
27156; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27157; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27158; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27159; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
27160; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27161; GFX8-NEXT:    s_setpc_b64 s[30:31]
27162;
27163; GFX9-LABEL: v_fcmp_uno_bf16:
27164; GFX9:       ; %bb.0:
27165; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27166; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27167; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27168; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
27169; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27170; GFX9-NEXT:    s_setpc_b64 s[30:31]
27171;
27172; GFX10-LABEL: v_fcmp_uno_bf16:
27173; GFX10:       ; %bb.0:
27174; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27175; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27176; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27177; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
27178; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27179; GFX10-NEXT:    s_setpc_b64 s[30:31]
27180;
27181; GFX11-LABEL: v_fcmp_uno_bf16:
27182; GFX11:       ; %bb.0:
27183; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27184; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27185; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27186; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27187; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
27188; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27189; GFX11-NEXT:    s_setpc_b64 s[30:31]
27190  %op = fcmp uno bfloat %a, %b
27191  ret i1 %op
27192}
27193
; Unordered-or-equal: `fcmp ueq` on two bfloat arguments, result returned as
; i1. Every check block below expects v_cmp_nlg_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27194define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
27195; GCN-LABEL: v_fcmp_ueq_bf16:
27196; GCN:       ; %bb.0:
27197; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27198; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27199; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27200; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27201; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27202; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
27203; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27204; GCN-NEXT:    s_setpc_b64 s[30:31]
27205;
27206; GFX7-LABEL: v_fcmp_ueq_bf16:
27207; GFX7:       ; %bb.0:
27208; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27209; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27210; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27211; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27212; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27213; GFX7-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
27214; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27215; GFX7-NEXT:    s_setpc_b64 s[30:31]
27216;
27217; GFX8-LABEL: v_fcmp_ueq_bf16:
27218; GFX8:       ; %bb.0:
27219; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27220; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27221; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27222; GFX8-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
27223; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27224; GFX8-NEXT:    s_setpc_b64 s[30:31]
27225;
27226; GFX9-LABEL: v_fcmp_ueq_bf16:
27227; GFX9:       ; %bb.0:
27228; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27229; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27230; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27231; GFX9-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
27232; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27233; GFX9-NEXT:    s_setpc_b64 s[30:31]
27234;
27235; GFX10-LABEL: v_fcmp_ueq_bf16:
27236; GFX10:       ; %bb.0:
27237; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27238; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27239; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27240; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
27241; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27242; GFX10-NEXT:    s_setpc_b64 s[30:31]
27243;
27244; GFX11-LABEL: v_fcmp_ueq_bf16:
27245; GFX11:       ; %bb.0:
27246; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27247; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27248; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27249; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27250; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
27251; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27252; GFX11-NEXT:    s_setpc_b64 s[30:31]
27253  %op = fcmp ueq bfloat %a, %b
27254  ret i1 %op
27255}
27256
; Unordered-or-greater: `fcmp ugt` on two bfloat arguments, result returned as
; i1. Every check block below expects v_cmp_nle_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27257define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
27258; GCN-LABEL: v_fcmp_ugt_bf16:
27259; GCN:       ; %bb.0:
27260; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27261; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27262; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27263; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27264; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27265; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
27266; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27267; GCN-NEXT:    s_setpc_b64 s[30:31]
27268;
27269; GFX7-LABEL: v_fcmp_ugt_bf16:
27270; GFX7:       ; %bb.0:
27271; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27272; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27273; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27274; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27275; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27276; GFX7-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
27277; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27278; GFX7-NEXT:    s_setpc_b64 s[30:31]
27279;
27280; GFX8-LABEL: v_fcmp_ugt_bf16:
27281; GFX8:       ; %bb.0:
27282; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27283; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27284; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27285; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
27286; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27287; GFX8-NEXT:    s_setpc_b64 s[30:31]
27288;
27289; GFX9-LABEL: v_fcmp_ugt_bf16:
27290; GFX9:       ; %bb.0:
27291; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27292; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27293; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27294; GFX9-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
27295; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27296; GFX9-NEXT:    s_setpc_b64 s[30:31]
27297;
27298; GFX10-LABEL: v_fcmp_ugt_bf16:
27299; GFX10:       ; %bb.0:
27300; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27301; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27302; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27303; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
27304; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27305; GFX10-NEXT:    s_setpc_b64 s[30:31]
27306;
27307; GFX11-LABEL: v_fcmp_ugt_bf16:
27308; GFX11:       ; %bb.0:
27309; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27310; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27311; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27312; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27313; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
27314; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27315; GFX11-NEXT:    s_setpc_b64 s[30:31]
27316  %op = fcmp ugt bfloat %a, %b
27317  ret i1 %op
27318}
27319
; Unordered-or-greater-equal: `fcmp uge` on two bfloat arguments, result
; returned as i1. Every check block below expects v_cmp_nlt_f32 followed by a
; v_cndmask that materializes 0/1 in v0. The first two check blocks prepare
; the operands with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining
; ones use a left shift by 16 instead.
27320define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
27321; GCN-LABEL: v_fcmp_uge_bf16:
27322; GCN:       ; %bb.0:
27323; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27324; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27325; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27326; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27327; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27328; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
27329; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27330; GCN-NEXT:    s_setpc_b64 s[30:31]
27331;
27332; GFX7-LABEL: v_fcmp_uge_bf16:
27333; GFX7:       ; %bb.0:
27334; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27335; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27336; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27337; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27338; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27339; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
27340; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27341; GFX7-NEXT:    s_setpc_b64 s[30:31]
27342;
27343; GFX8-LABEL: v_fcmp_uge_bf16:
27344; GFX8:       ; %bb.0:
27345; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27346; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27347; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27348; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
27349; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27350; GFX8-NEXT:    s_setpc_b64 s[30:31]
27351;
27352; GFX9-LABEL: v_fcmp_uge_bf16:
27353; GFX9:       ; %bb.0:
27354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27355; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27356; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27357; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
27358; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27359; GFX9-NEXT:    s_setpc_b64 s[30:31]
27360;
27361; GFX10-LABEL: v_fcmp_uge_bf16:
27362; GFX10:       ; %bb.0:
27363; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27364; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27365; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27366; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
27367; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27368; GFX10-NEXT:    s_setpc_b64 s[30:31]
27369;
27370; GFX11-LABEL: v_fcmp_uge_bf16:
27371; GFX11:       ; %bb.0:
27372; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27373; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27374; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27375; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27376; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
27377; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27378; GFX11-NEXT:    s_setpc_b64 s[30:31]
27379  %op = fcmp uge bfloat %a, %b
27380  ret i1 %op
27381}
27382
; Unordered-or-less: `fcmp ult` on two bfloat arguments, result returned as
; i1. Every check block below expects v_cmp_nge_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27383define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
27384; GCN-LABEL: v_fcmp_ult_bf16:
27385; GCN:       ; %bb.0:
27386; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27387; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27388; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27389; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27390; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27391; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
27392; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27393; GCN-NEXT:    s_setpc_b64 s[30:31]
27394;
27395; GFX7-LABEL: v_fcmp_ult_bf16:
27396; GFX7:       ; %bb.0:
27397; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27398; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27399; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27400; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27401; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27402; GFX7-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
27403; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27404; GFX7-NEXT:    s_setpc_b64 s[30:31]
27405;
27406; GFX8-LABEL: v_fcmp_ult_bf16:
27407; GFX8:       ; %bb.0:
27408; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27409; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27410; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27411; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
27412; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27413; GFX8-NEXT:    s_setpc_b64 s[30:31]
27414;
27415; GFX9-LABEL: v_fcmp_ult_bf16:
27416; GFX9:       ; %bb.0:
27417; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27418; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27419; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27420; GFX9-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
27421; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27422; GFX9-NEXT:    s_setpc_b64 s[30:31]
27423;
27424; GFX10-LABEL: v_fcmp_ult_bf16:
27425; GFX10:       ; %bb.0:
27426; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27427; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27428; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27429; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
27430; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27431; GFX10-NEXT:    s_setpc_b64 s[30:31]
27432;
27433; GFX11-LABEL: v_fcmp_ult_bf16:
27434; GFX11:       ; %bb.0:
27435; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27436; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27437; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27438; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27439; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
27440; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27441; GFX11-NEXT:    s_setpc_b64 s[30:31]
27442  %op = fcmp ult bfloat %a, %b
27443  ret i1 %op
27444}
27445
; Unordered-or-less-equal: `fcmp ule` on two bfloat arguments, result returned
; as i1. Every check block below expects v_cmp_ngt_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27446define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
27447; GCN-LABEL: v_fcmp_ule_bf16:
27448; GCN:       ; %bb.0:
27449; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27450; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27451; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27452; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27453; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27454; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
27455; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27456; GCN-NEXT:    s_setpc_b64 s[30:31]
27457;
27458; GFX7-LABEL: v_fcmp_ule_bf16:
27459; GFX7:       ; %bb.0:
27460; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27461; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27462; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27463; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27464; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27465; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
27466; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27467; GFX7-NEXT:    s_setpc_b64 s[30:31]
27468;
27469; GFX8-LABEL: v_fcmp_ule_bf16:
27470; GFX8:       ; %bb.0:
27471; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27472; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27473; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27474; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
27475; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27476; GFX8-NEXT:    s_setpc_b64 s[30:31]
27477;
27478; GFX9-LABEL: v_fcmp_ule_bf16:
27479; GFX9:       ; %bb.0:
27480; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27481; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27482; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27483; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
27484; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27485; GFX9-NEXT:    s_setpc_b64 s[30:31]
27486;
27487; GFX10-LABEL: v_fcmp_ule_bf16:
27488; GFX10:       ; %bb.0:
27489; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27490; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27491; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27492; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
27493; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27494; GFX10-NEXT:    s_setpc_b64 s[30:31]
27495;
27496; GFX11-LABEL: v_fcmp_ule_bf16:
27497; GFX11:       ; %bb.0:
27498; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27499; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27500; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27501; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27502; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
27503; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27504; GFX11-NEXT:    s_setpc_b64 s[30:31]
27505  %op = fcmp ule bfloat %a, %b
27506  ret i1 %op
27507}
27508
; Unordered-or-not-equal: `fcmp une` on two bfloat arguments, result returned
; as i1. Every check block below expects v_cmp_neq_f32 followed by a v_cndmask
; that materializes 0/1 in v0. The first two check blocks prepare the operands
; with v_mul_f32 by 1.0 plus a 0xffff0000 mask; the remaining ones use a left
; shift by 16 instead.
27509define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
27510; GCN-LABEL: v_fcmp_une_bf16:
27511; GCN:       ; %bb.0:
27512; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27513; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27514; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27515; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27516; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27517; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
27518; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27519; GCN-NEXT:    s_setpc_b64 s[30:31]
27520;
27521; GFX7-LABEL: v_fcmp_une_bf16:
27522; GFX7:       ; %bb.0:
27523; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27524; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27525; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
27526; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
27527; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
27528; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
27529; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27530; GFX7-NEXT:    s_setpc_b64 s[30:31]
27531;
27532; GFX8-LABEL: v_fcmp_une_bf16:
27533; GFX8:       ; %bb.0:
27534; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27535; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27536; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27537; GFX8-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
27538; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27539; GFX8-NEXT:    s_setpc_b64 s[30:31]
27540;
27541; GFX9-LABEL: v_fcmp_une_bf16:
27542; GFX9:       ; %bb.0:
27543; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27544; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27545; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27546; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
27547; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
27548; GFX9-NEXT:    s_setpc_b64 s[30:31]
27549;
27550; GFX10-LABEL: v_fcmp_une_bf16:
27551; GFX10:       ; %bb.0:
27552; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27553; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27554; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27555; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
27556; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27557; GFX10-NEXT:    s_setpc_b64 s[30:31]
27558;
27559; GFX11-LABEL: v_fcmp_une_bf16:
27560; GFX11:       ; %bb.0:
27561; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27562; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
27563; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27564; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27565; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
27566; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27567; GFX11-NEXT:    s_setpc_b64 s[30:31]
27568  %op = fcmp une bfloat %a, %b
27569  ret i1 %op
27570}
27571
; Always-true predicate: `fcmp true` on two bfloat arguments, result returned
; as i1. No compare instruction is expected on any target: every check block
; below consists only of the entry wait, a v_mov_b32 of the constant 1 into
; v0, and the return.
27572define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
27573; GCN-LABEL: v_fcmp_true_bf16:
27574; GCN:       ; %bb.0:
27575; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27576; GCN-NEXT:    v_mov_b32_e32 v0, 1
27577; GCN-NEXT:    s_setpc_b64 s[30:31]
27578;
27579; GFX7-LABEL: v_fcmp_true_bf16:
27580; GFX7:       ; %bb.0:
27581; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27582; GFX7-NEXT:    v_mov_b32_e32 v0, 1
27583; GFX7-NEXT:    s_setpc_b64 s[30:31]
27584;
27585; GFX8-LABEL: v_fcmp_true_bf16:
27586; GFX8:       ; %bb.0:
27587; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27588; GFX8-NEXT:    v_mov_b32_e32 v0, 1
27589; GFX8-NEXT:    s_setpc_b64 s[30:31]
27590;
27591; GFX9-LABEL: v_fcmp_true_bf16:
27592; GFX9:       ; %bb.0:
27593; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27594; GFX9-NEXT:    v_mov_b32_e32 v0, 1
27595; GFX9-NEXT:    s_setpc_b64 s[30:31]
27596;
27597; GFX10-LABEL: v_fcmp_true_bf16:
27598; GFX10:       ; %bb.0:
27599; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27600; GFX10-NEXT:    v_mov_b32_e32 v0, 1
27601; GFX10-NEXT:    s_setpc_b64 s[30:31]
27602;
27603; GFX11-LABEL: v_fcmp_true_bf16:
27604; GFX11:       ; %bb.0:
27605; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27606; GFX11-NEXT:    v_mov_b32_e32 v0, 1
27607; GFX11-NEXT:    s_setpc_b64 s[30:31]
27608  %op = fcmp true bfloat %a, %b
27609  ret i1 %op
27610}
27611
; Declaration of the bfloat copysign intrinsic: takes two bfloat operands and
; returns a bfloat.
27612declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
27613
; copysign with both bfloat operands passed as ordinary arguments (the
; expectations read them from v0 = %mag and v1 = %sign).
; The first two check blocks expect a bit-manipulation sequence: mask the sign
; operand with 0x80000000 and shift it right by 16, extract 15 bits of the
; magnitude starting at bit 16 with v_bfe_u32, OR the two together, then shift
; the result left by 16.
; The remaining check blocks expect a single v_bfi_b32 with a 0x7fff mask; the
; mask is first loaded into s4 with s_movk_i32 in two of them and used as a
; literal operand in the last two.
27614define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
27615; GCN-LABEL: v_copysign_bf16_bf16:
27616; GCN:       ; %bb.0:
27617; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27618; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27619; GCN-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
27620; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27621; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
27622; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
27623; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27624; GCN-NEXT:    s_setpc_b64 s[30:31]
27625;
27626; GFX7-LABEL: v_copysign_bf16_bf16:
27627; GFX7:       ; %bb.0:
27628; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27629; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27630; GFX7-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
27631; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27632; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
27633; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
27634; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27635; GFX7-NEXT:    s_setpc_b64 s[30:31]
27636;
27637; GFX8-LABEL: v_copysign_bf16_bf16:
27638; GFX8:       ; %bb.0:
27639; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27640; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
27641; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
27642; GFX8-NEXT:    s_setpc_b64 s[30:31]
27643;
27644; GFX9-LABEL: v_copysign_bf16_bf16:
27645; GFX9:       ; %bb.0:
27646; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27647; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
27648; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
27649; GFX9-NEXT:    s_setpc_b64 s[30:31]
27650;
27651; GFX10-LABEL: v_copysign_bf16_bf16:
27652; GFX10:       ; %bb.0:
27653; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27654; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27655; GFX10-NEXT:    s_setpc_b64 s[30:31]
27656;
27657; GFX11-LABEL: v_copysign_bf16_bf16:
27658; GFX11:       ; %bb.0:
27659; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27660; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27661; GFX11-NEXT:    s_setpc_b64 s[30:31]
27662  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27663  ret bfloat %op
27664}
27665
; copysign where the sign operand is marked `inreg` and the magnitude is an
; ordinary argument. The expectations read the magnitude from v0 and the sign
; from a scalar register: s16 in all check blocks except the last, which
; uses s0.
; The first two check blocks expect the sign bit to be isolated with scalar
; s_and_b32 / s_lshr_b32 and then OR-ed into the 15-bit magnitude field.
; The next two copy s16 into v1 before a v_bfi_b32 whose 0x7fff mask is held
; in s4; the last two feed the scalar register straight into v_bfi_b32 with a
; literal 0x7fff mask.
27666define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
27667; GCN-LABEL: v_copysign_bf16_s_bf16:
27668; GCN:       ; %bb.0:
27669; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27670; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27671; GCN-NEXT:    s_and_b32 s4, s16, 0x80000000
27672; GCN-NEXT:    s_lshr_b32 s4, s4, 16
27673; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
27674; GCN-NEXT:    v_or_b32_e32 v0, s4, v0
27675; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27676; GCN-NEXT:    s_setpc_b64 s[30:31]
27677;
27678; GFX7-LABEL: v_copysign_bf16_s_bf16:
27679; GFX7:       ; %bb.0:
27680; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27681; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27682; GFX7-NEXT:    s_and_b32 s4, s16, 0x80000000
27683; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
27684; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
27685; GFX7-NEXT:    v_or_b32_e32 v0, s4, v0
27686; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27687; GFX7-NEXT:    s_setpc_b64 s[30:31]
27688;
27689; GFX8-LABEL: v_copysign_bf16_s_bf16:
27690; GFX8:       ; %bb.0:
27691; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27692; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
27693; GFX8-NEXT:    v_mov_b32_e32 v1, s16
27694; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
27695; GFX8-NEXT:    s_setpc_b64 s[30:31]
27696;
27697; GFX9-LABEL: v_copysign_bf16_s_bf16:
27698; GFX9:       ; %bb.0:
27699; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27700; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
27701; GFX9-NEXT:    v_mov_b32_e32 v1, s16
27702; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
27703; GFX9-NEXT:    s_setpc_b64 s[30:31]
27704;
27705; GFX10-LABEL: v_copysign_bf16_s_bf16:
27706; GFX10:       ; %bb.0:
27707; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27708; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s16
27709; GFX10-NEXT:    s_setpc_b64 s[30:31]
27710;
27711; GFX11-LABEL: v_copysign_bf16_s_bf16:
27712; GFX11:       ; %bb.0:
27713; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27714; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s0
27715; GFX11-NEXT:    s_setpc_b64 s[30:31]
27716  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27717  ret bfloat %op
27718}
27719
; bfloat copysign where the magnitude is marked inreg and the sign is an
; ordinary argument. In the expected code the magnitude is read from a scalar
; register (s16, or s0 in the GFX11 checks) and the sign from v0.
27720define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
27721; GCN-LABEL: v_copysign_s_bf16_bf16:
27722; GCN:       ; %bb.0:
27723; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27724; GCN-NEXT:    v_mul_f32_e64 v1, 1.0, s16
27725; GCN-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
27726; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
27727; GCN-NEXT:    v_bfe_u32 v1, v1, 16, 15
27728; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
27729; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27730; GCN-NEXT:    s_setpc_b64 s[30:31]
27731;
27732; GFX7-LABEL: v_copysign_s_bf16_bf16:
27733; GFX7:       ; %bb.0:
27734; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27735; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s16
27736; GFX7-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
27737; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
27738; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 15
27739; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
27740; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27741; GFX7-NEXT:    s_setpc_b64 s[30:31]
27742;
27743; GFX8-LABEL: v_copysign_s_bf16_bf16:
27744; GFX8:       ; %bb.0:
27745; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27746; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
27747; GFX8-NEXT:    v_mov_b32_e32 v1, s16
27748; GFX8-NEXT:    v_bfi_b32 v0, s4, v1, v0
27749; GFX8-NEXT:    s_setpc_b64 s[30:31]
27750;
27751; GFX9-LABEL: v_copysign_s_bf16_bf16:
27752; GFX9:       ; %bb.0:
27753; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27754; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
27755; GFX9-NEXT:    v_mov_b32_e32 v1, s16
27756; GFX9-NEXT:    v_bfi_b32 v0, s4, v1, v0
27757; GFX9-NEXT:    s_setpc_b64 s[30:31]
27758;
27759; GFX10-LABEL: v_copysign_s_bf16_bf16:
27760; GFX10:       ; %bb.0:
27761; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27762; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s16, v0
27763; GFX10-NEXT:    s_setpc_b64 s[30:31]
27764;
27765; GFX11-LABEL: v_copysign_s_bf16_bf16:
27766; GFX11:       ; %bb.0:
27767; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27768; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
27769; GFX11-NEXT:    s_setpc_b64 s[30:31]
27770  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27771  ret bfloat %op
27772}
27773
; bfloat copysign whose sign operand is a float truncated to bfloat with
; fptrunc. The expected code contains no separate conversion instruction for
; the sign. The GCN and GFX7 checks mask v1 with 0x80000000 and shift it right
; by 16, while the remaining checks shift v1 right by 16 and merge with a
; 0x7fff v_bfi_b32.
27774define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
27775; GCN-LABEL: v_copysign_bf16_f32:
27776; GCN:       ; %bb.0:
27777; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27778; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27779; GCN-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
27780; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27781; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
27782; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
27783; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27784; GCN-NEXT:    s_setpc_b64 s[30:31]
27785;
27786; GFX7-LABEL: v_copysign_bf16_f32:
27787; GFX7:       ; %bb.0:
27788; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27789; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27790; GFX7-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
27791; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27792; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
27793; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
27794; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27795; GFX7-NEXT:    s_setpc_b64 s[30:31]
27796;
27797; GFX8-LABEL: v_copysign_bf16_f32:
27798; GFX8:       ; %bb.0:
27799; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27800; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27801; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
27802; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
27803; GFX8-NEXT:    s_setpc_b64 s[30:31]
27804;
27805; GFX9-LABEL: v_copysign_bf16_f32:
27806; GFX9:       ; %bb.0:
27807; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27808; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27809; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
27810; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
27811; GFX9-NEXT:    s_setpc_b64 s[30:31]
27812;
27813; GFX10-LABEL: v_copysign_bf16_f32:
27814; GFX10:       ; %bb.0:
27815; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27816; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27817; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27818; GFX10-NEXT:    s_setpc_b64 s[30:31]
27819;
27820; GFX11-LABEL: v_copysign_bf16_f32:
27821; GFX11:       ; %bb.0:
27822; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27823; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27824; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27825; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27826; GFX11-NEXT:    s_setpc_b64 s[30:31]
27827  %sign = fptrunc float %sign.f32 to bfloat
27828  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27829  ret bfloat %op
27830}
27831
; bfloat copysign whose sign operand is a double truncated to bfloat with
; fptrunc. The expected code reads only v2 of the sign argument (presumably the
; high dword of the double, which holds its sign bit) and shifts or masks it;
; no separate conversion instruction appears in the checks.
27832define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
27833; GCN-LABEL: v_copysign_bf16_f64:
27834; GCN:       ; %bb.0:
27835; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27836; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27837; GCN-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
27838; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27839; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
27840; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
27841; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27842; GCN-NEXT:    s_setpc_b64 s[30:31]
27843;
27844; GFX7-LABEL: v_copysign_bf16_f64:
27845; GFX7:       ; %bb.0:
27846; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27847; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27848; GFX7-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
27849; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
27850; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
27851; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
27852; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27853; GFX7-NEXT:    s_setpc_b64 s[30:31]
27854;
27855; GFX8-LABEL: v_copysign_bf16_f64:
27856; GFX8:       ; %bb.0:
27857; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27858; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
27859; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
27860; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
27861; GFX8-NEXT:    s_setpc_b64 s[30:31]
27862;
27863; GFX9-LABEL: v_copysign_bf16_f64:
27864; GFX9:       ; %bb.0:
27865; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27866; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
27867; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
27868; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
27869; GFX9-NEXT:    s_setpc_b64 s[30:31]
27870;
27871; GFX10-LABEL: v_copysign_bf16_f64:
27872; GFX10:       ; %bb.0:
27873; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27874; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
27875; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27876; GFX10-NEXT:    s_setpc_b64 s[30:31]
27877;
27878; GFX11-LABEL: v_copysign_bf16_f64:
27879; GFX11:       ; %bb.0:
27880; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27881; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
27882; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27883; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27884; GFX11-NEXT:    s_setpc_b64 s[30:31]
27885  %sign = fptrunc double %sign.f64 to bfloat
27886  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27887  ret bfloat %op
27888}
27889
; bfloat copysign whose sign operand is a half reinterpreted as bfloat with a
; bitcast (no value conversion in the IR). The GCN and GFX7 checks first run
; v_cvt_f16_f32 on v1 and mask the result with 0x8000; the remaining checks
; are a single v_bfi_b32 with a 0x7fff mask.
27890define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
27891; GCN-LABEL: v_copysign_bf16_f16:
27892; GCN:       ; %bb.0:
27893; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27894; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27895; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
27896; GCN-NEXT:    v_and_b32_e32 v1, 0x8000, v1
27897; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
27898; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
27899; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27900; GCN-NEXT:    s_setpc_b64 s[30:31]
27901;
27902; GFX7-LABEL: v_copysign_bf16_f16:
27903; GFX7:       ; %bb.0:
27904; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27905; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
27906; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
27907; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
27908; GFX7-NEXT:    v_and_b32_e32 v1, 0x8000, v1
27909; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
27910; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
27911; GFX7-NEXT:    s_setpc_b64 s[30:31]
27912;
27913; GFX8-LABEL: v_copysign_bf16_f16:
27914; GFX8:       ; %bb.0:
27915; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27916; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
27917; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
27918; GFX8-NEXT:    s_setpc_b64 s[30:31]
27919;
27920; GFX9-LABEL: v_copysign_bf16_f16:
27921; GFX9:       ; %bb.0:
27922; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27923; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
27924; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
27925; GFX9-NEXT:    s_setpc_b64 s[30:31]
27926;
27927; GFX10-LABEL: v_copysign_bf16_f16:
27928; GFX10:       ; %bb.0:
27929; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27930; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27931; GFX10-NEXT:    s_setpc_b64 s[30:31]
27932;
27933; GFX11-LABEL: v_copysign_bf16_f16:
27934; GFX11:       ; %bb.0:
27935; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27936; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
27937; GFX11-NEXT:    s_setpc_b64 s[30:31]
27938  %sign = bitcast half %sign.f16 to bfloat
27939  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27940  ret bfloat %op
27941}
27942
; amdgpu_ps variant of bfloat copysign with both operands marked inreg; the
; expected code reads them from s0 and s1. The bfloat result is bitcast to
; i16, zero-extended to i32 and passed through llvm.amdgcn.readfirstlane, so
; every expected sequence ends with v_readfirstlane_b32 into s0.
27943define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
27944; GCN-LABEL: s_copysign_bf16_bf16:
27945; GCN:       ; %bb.0:
27946; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
27947; GCN-NEXT:    s_and_b32 s0, s1, 0x80000000
27948; GCN-NEXT:    s_lshr_b32 s0, s0, 16
27949; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
27950; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
27951; GCN-NEXT:    v_readfirstlane_b32 s0, v0
27952; GCN-NEXT:    ; return to shader part epilog
27953;
27954; GFX7-LABEL: s_copysign_bf16_bf16:
27955; GFX7:       ; %bb.0:
27956; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
27957; GFX7-NEXT:    s_and_b32 s0, s1, 0x80000000
27958; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
27959; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
27960; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
27961; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
27962; GFX7-NEXT:    ; return to shader part epilog
27963;
27964; GFX8-LABEL: s_copysign_bf16_bf16:
27965; GFX8:       ; %bb.0:
27966; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
27967; GFX8-NEXT:    v_mov_b32_e32 v0, s0
27968; GFX8-NEXT:    v_mov_b32_e32 v1, s1
27969; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
27970; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
27971; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
27972; GFX8-NEXT:    ; return to shader part epilog
27973;
27974; GFX9-LABEL: s_copysign_bf16_bf16:
27975; GFX9:       ; %bb.0:
27976; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
27977; GFX9-NEXT:    v_mov_b32_e32 v0, s0
27978; GFX9-NEXT:    v_mov_b32_e32 v1, s1
27979; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
27980; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
27981; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
27982; GFX9-NEXT:    ; return to shader part epilog
27983;
27984; GFX10-LABEL: s_copysign_bf16_bf16:
27985; GFX10:       ; %bb.0:
27986; GFX10-NEXT:    v_mov_b32_e32 v0, s1
27987; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
27988; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
27989; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
27990; GFX10-NEXT:    ; return to shader part epilog
27991;
27992; GFX11-LABEL: s_copysign_bf16_bf16:
27993; GFX11:       ; %bb.0:
27994; GFX11-NEXT:    v_mov_b32_e32 v0, s1
27995; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
27996; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
27997; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
27998; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
27999; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
28000; GFX11-NEXT:    ; return to shader part epilog
28001  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28002  %cast = bitcast bfloat %op to i16
28003  %zext = zext i16 %cast to i32
28004  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28005  ret i32 %readlane
28006}
28007
; amdgpu_ps variant of bfloat copysign with inreg operands, where the sign is a
; float truncated to bfloat with fptrunc. The expected code only masks or
; shifts s1 by 16 to obtain the sign; no separate conversion instruction
; appears. The result is bitcast to i16, zero-extended to i32 and returned
; through llvm.amdgcn.readfirstlane.
28008define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
28009; GCN-LABEL: s_copysign_bf16_f32:
28010; GCN:       ; %bb.0:
28011; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
28012; GCN-NEXT:    s_and_b32 s0, s1, 0x80000000
28013; GCN-NEXT:    s_lshr_b32 s0, s0, 16
28014; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
28015; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
28016; GCN-NEXT:    v_readfirstlane_b32 s0, v0
28017; GCN-NEXT:    ; return to shader part epilog
28018;
28019; GFX7-LABEL: s_copysign_bf16_f32:
28020; GFX7:       ; %bb.0:
28021; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
28022; GFX7-NEXT:    s_and_b32 s0, s1, 0x80000000
28023; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
28024; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
28025; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
28026; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
28027; GFX7-NEXT:    ; return to shader part epilog
28028;
28029; GFX8-LABEL: s_copysign_bf16_f32:
28030; GFX8:       ; %bb.0:
28031; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
28032; GFX8-NEXT:    s_movk_i32 s1, 0x7fff
28033; GFX8-NEXT:    v_mov_b32_e32 v1, s0
28034; GFX8-NEXT:    v_bfi_b32 v0, s1, v1, v0
28035; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28036; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
28037; GFX8-NEXT:    ; return to shader part epilog
28038;
28039; GFX9-LABEL: s_copysign_bf16_f32:
28040; GFX9:       ; %bb.0:
28041; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
28042; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
28043; GFX9-NEXT:    v_mov_b32_e32 v1, s0
28044; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
28045; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28046; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
28047; GFX9-NEXT:    ; return to shader part epilog
28048;
28049; GFX10-LABEL: s_copysign_bf16_f32:
28050; GFX10:       ; %bb.0:
28051; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
28052; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28053; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28054; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
28055; GFX10-NEXT:    ; return to shader part epilog
28056;
28057; GFX11-LABEL: s_copysign_bf16_f32:
28058; GFX11:       ; %bb.0:
28059; GFX11-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
28060; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28061; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28062; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28063; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28064; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
28065; GFX11-NEXT:    ; return to shader part epilog
28066  %sign = fptrunc float %sign.f32 to bfloat
28067  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28068  %cast = bitcast bfloat %op to i16
28069  %zext = zext i16 %cast to i32
28070  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28071  ret i32 %readlane
28072}
28073
; amdgpu_ps variant of bfloat copysign with inreg operands, where the sign is a
; double truncated to bfloat with fptrunc. The expected code reads only s2 of
; the sign argument (presumably the high dword of the double, which holds its
; sign bit). The result is bitcast to i16, zero-extended to i32 and returned
; through llvm.amdgcn.readfirstlane.
28074define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
28075; GCN-LABEL: s_copysign_bf16_f64:
28076; GCN:       ; %bb.0:
28077; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
28078; GCN-NEXT:    s_and_b32 s0, s2, 0x80000000
28079; GCN-NEXT:    s_lshr_b32 s0, s0, 16
28080; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
28081; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
28082; GCN-NEXT:    v_readfirstlane_b32 s0, v0
28083; GCN-NEXT:    ; return to shader part epilog
28084;
28085; GFX7-LABEL: s_copysign_bf16_f64:
28086; GFX7:       ; %bb.0:
28087; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
28088; GFX7-NEXT:    s_and_b32 s0, s2, 0x80000000
28089; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
28090; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
28091; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
28092; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
28093; GFX7-NEXT:    ; return to shader part epilog
28094;
28095; GFX8-LABEL: s_copysign_bf16_f64:
28096; GFX8:       ; %bb.0:
28097; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
28098; GFX8-NEXT:    s_movk_i32 s1, 0x7fff
28099; GFX8-NEXT:    v_mov_b32_e32 v1, s0
28100; GFX8-NEXT:    v_bfi_b32 v0, s1, v1, v0
28101; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28102; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
28103; GFX8-NEXT:    ; return to shader part epilog
28104;
28105; GFX9-LABEL: s_copysign_bf16_f64:
28106; GFX9:       ; %bb.0:
28107; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
28108; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
28109; GFX9-NEXT:    v_mov_b32_e32 v1, s0
28110; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
28111; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28112; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
28113; GFX9-NEXT:    ; return to shader part epilog
28114;
28115; GFX10-LABEL: s_copysign_bf16_f64:
28116; GFX10:       ; %bb.0:
28117; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
28118; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28119; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28120; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
28121; GFX10-NEXT:    ; return to shader part epilog
28122;
28123; GFX11-LABEL: s_copysign_bf16_f64:
28124; GFX11:       ; %bb.0:
28125; GFX11-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
28126; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28127; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28128; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28130; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
28131; GFX11-NEXT:    ; return to shader part epilog
28132  %sign = fptrunc double %sign.f64 to bfloat
28133  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28134  %cast = bitcast bfloat %op to i16
28135  %zext = zext i16 %cast to i32
28136  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28137  ret i32 %readlane
28138}
28139
; amdgpu_ps variant of bfloat copysign with inreg operands, where the sign is a
; half reinterpreted as bfloat with a bitcast. The GCN and GFX7 checks run
; v_cvt_f16_f32 on s1 and mask the result with 0x8000; the remaining checks
; merge s0 and s1 with a 0x7fff v_bfi_b32. The result is bitcast to i16,
; zero-extended to i32 and returned through llvm.amdgcn.readfirstlane.
28140define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
28141; GCN-LABEL: s_copysign_bf16_f16:
28142; GCN:       ; %bb.0:
28143; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
28144; GCN-NEXT:    v_cvt_f16_f32_e32 v1, s1
28145; GCN-NEXT:    v_and_b32_e32 v1, 0x8000, v1
28146; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
28147; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
28148; GCN-NEXT:    v_readfirstlane_b32 s0, v0
28149; GCN-NEXT:    ; return to shader part epilog
28150;
28151; GFX7-LABEL: s_copysign_bf16_f16:
28152; GFX7:       ; %bb.0:
28153; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, s1
28154; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s0
28155; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 15
28156; GFX7-NEXT:    v_and_b32_e32 v0, 0x8000, v0
28157; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
28158; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
28159; GFX7-NEXT:    ; return to shader part epilog
28160;
28161; GFX8-LABEL: s_copysign_bf16_f16:
28162; GFX8:       ; %bb.0:
28163; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
28164; GFX8-NEXT:    v_mov_b32_e32 v0, s0
28165; GFX8-NEXT:    v_mov_b32_e32 v1, s1
28166; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
28167; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28168; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
28169; GFX8-NEXT:    ; return to shader part epilog
28170;
28171; GFX9-LABEL: s_copysign_bf16_f16:
28172; GFX9:       ; %bb.0:
28173; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
28174; GFX9-NEXT:    v_mov_b32_e32 v0, s0
28175; GFX9-NEXT:    v_mov_b32_e32 v1, s1
28176; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
28177; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28178; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
28179; GFX9-NEXT:    ; return to shader part epilog
28180;
28181; GFX10-LABEL: s_copysign_bf16_f16:
28182; GFX10:       ; %bb.0:
28183; GFX10-NEXT:    v_mov_b32_e32 v0, s1
28184; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28185; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28186; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
28187; GFX10-NEXT:    ; return to shader part epilog
28188;
28189; GFX11-LABEL: s_copysign_bf16_f16:
28190; GFX11:       ; %bb.0:
28191; GFX11-NEXT:    v_mov_b32_e32 v0, s1
28192; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28193; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28194; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28195; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28196; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
28197; GFX11-NEXT:    ; return to shader part epilog
28198  %sign = bitcast half %sign.f16 to bfloat
28199  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28200  %cast = bitcast bfloat %op to i16
28201  %zext = zext i16 %cast to i32
28202  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28203  ret i32 %readlane
28204}
28205
; Declaration of the float copysign intrinsic, called by the tests that pair a
; float magnitude with a bfloat sign.
28206declare float @llvm.copysign.f32(float, float)
28207
; float copysign whose sign operand is a bfloat widened with fpext. The GCN and
; GFX7 checks feed v1 straight into v_bfi_b32; the remaining checks first shift
; v1 left by 16. The bitfield mask is 0x7fffffff, materialised with
; s_brev_b32 -2 in the checks that keep it in a register.
28208define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
28209; GCN-LABEL: v_copysign_f32_bf16:
28210; GCN:       ; %bb.0:
28211; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28212; GCN-NEXT:    s_brev_b32 s4, -2
28213; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
28214; GCN-NEXT:    s_setpc_b64 s[30:31]
28215;
28216; GFX7-LABEL: v_copysign_f32_bf16:
28217; GFX7:       ; %bb.0:
28218; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28219; GFX7-NEXT:    s_brev_b32 s4, -2
28220; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
28221; GFX7-NEXT:    s_setpc_b64 s[30:31]
28222;
28223; GFX8-LABEL: v_copysign_f32_bf16:
28224; GFX8:       ; %bb.0:
28225; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28226; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28227; GFX8-NEXT:    s_brev_b32 s4, -2
28228; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
28229; GFX8-NEXT:    s_setpc_b64 s[30:31]
28230;
28231; GFX9-LABEL: v_copysign_f32_bf16:
28232; GFX9:       ; %bb.0:
28233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28234; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28235; GFX9-NEXT:    s_brev_b32 s4, -2
28236; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
28237; GFX9-NEXT:    s_setpc_b64 s[30:31]
28238;
28239; GFX10-LABEL: v_copysign_f32_bf16:
28240; GFX10:       ; %bb.0:
28241; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28242; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28243; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
28244; GFX10-NEXT:    s_setpc_b64 s[30:31]
28245;
28246; GFX11-LABEL: v_copysign_f32_bf16:
28247; GFX11:       ; %bb.0:
28248; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28249; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28250; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28251; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
28252; GFX11-NEXT:    s_setpc_b64 s[30:31]
28253  %sign = fpext bfloat %sign.bf16 to float
28254  %op = call float @llvm.copysign.f32(float %mag, float %sign)
28255  ret float %op
28256}
28257
; amdgpu_ps variant of float copysign with inreg operands, where the sign is a
; bfloat widened with fpext. The GCN and GFX7 checks copy s1 unchanged into a
; VGPR; the remaining checks shift s1 left by 16 first. The float result is
; bitcast to i32 and returned through llvm.amdgcn.readfirstlane.
28258define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
28259; GCN-LABEL: s_copysign_f32_bf16:
28260; GCN:       ; %bb.0:
28261; GCN-NEXT:    s_brev_b32 s2, -2
28262; GCN-NEXT:    v_mov_b32_e32 v0, s0
28263; GCN-NEXT:    v_mov_b32_e32 v1, s1
28264; GCN-NEXT:    v_bfi_b32 v0, s2, v0, v1
28265; GCN-NEXT:    v_readfirstlane_b32 s0, v0
28266; GCN-NEXT:    ; return to shader part epilog
28267;
28268; GFX7-LABEL: s_copysign_f32_bf16:
28269; GFX7:       ; %bb.0:
28270; GFX7-NEXT:    s_brev_b32 s2, -2
28271; GFX7-NEXT:    v_mov_b32_e32 v0, s0
28272; GFX7-NEXT:    v_mov_b32_e32 v1, s1
28273; GFX7-NEXT:    v_bfi_b32 v0, s2, v0, v1
28274; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
28275; GFX7-NEXT:    ; return to shader part epilog
28276;
28277; GFX8-LABEL: s_copysign_f32_bf16:
28278; GFX8:       ; %bb.0:
28279; GFX8-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
28280; GFX8-NEXT:    s_brev_b32 s1, -2
28281; GFX8-NEXT:    v_mov_b32_e32 v1, s0
28282; GFX8-NEXT:    v_bfi_b32 v0, s1, v1, v0
28283; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
28284; GFX8-NEXT:    ; return to shader part epilog
28285;
28286; GFX9-LABEL: s_copysign_f32_bf16:
28287; GFX9:       ; %bb.0:
28288; GFX9-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
28289; GFX9-NEXT:    s_brev_b32 s1, -2
28290; GFX9-NEXT:    v_mov_b32_e32 v1, s0
28291; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
28292; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
28293; GFX9-NEXT:    ; return to shader part epilog
28294;
28295; GFX10-LABEL: s_copysign_f32_bf16:
28296; GFX10:       ; %bb.0:
28297; GFX10-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
28298; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
28299; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
28300; GFX10-NEXT:    ; return to shader part epilog
28301;
28302; GFX11-LABEL: s_copysign_f32_bf16:
28303; GFX11:       ; %bb.0:
28304; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
28305; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28306; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
28307; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
28308; GFX11-NEXT:    ; return to shader part epilog
28309  %sign = fpext bfloat %sign.bf16 to float
28310  %op = call float @llvm.copysign.f32(float %mag, float %sign)
28311  %cast = bitcast float %op to i32
28312  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
28313  ret i32 %readlane
28314}
28315
; Declaration of the half copysign intrinsic, called by the tests that pair a
; half magnitude with a bfloat sign.
28316declare half @llvm.copysign.f16(half, half)
28317
; half copysign whose sign operand is a bfloat reinterpreted as half with a
; bitcast (no value conversion in the IR). The GCN and GFX7 checks go through
; v_cvt_f16_f32 / v_cvt_f32_f16 conversions and a 0x7fffffff v_bfi_b32
; (mask built with s_brev_b32 -2); the remaining checks are a single v_bfi_b32
; with a 0x7fff mask.
28318define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
28319; GCN-LABEL: v_copysign_f16_bf16:
28320; GCN:       ; %bb.0:
28321; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28322; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28323; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
28324; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
28325; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
28326; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
28327; GCN-NEXT:    s_brev_b32 s4, -2
28328; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
28329; GCN-NEXT:    s_setpc_b64 s[30:31]
28330;
28331; GFX7-LABEL: v_copysign_f16_bf16:
28332; GFX7:       ; %bb.0:
28333; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28334; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
28335; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28336; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
28337; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
28338; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
28339; GFX7-NEXT:    s_brev_b32 s4, -2
28340; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
28341; GFX7-NEXT:    s_setpc_b64 s[30:31]
28342;
28343; GFX8-LABEL: v_copysign_f16_bf16:
28344; GFX8:       ; %bb.0:
28345; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28346; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
28347; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
28348; GFX8-NEXT:    s_setpc_b64 s[30:31]
28349;
28350; GFX9-LABEL: v_copysign_f16_bf16:
28351; GFX9:       ; %bb.0:
28352; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28353; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
28354; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
28355; GFX9-NEXT:    s_setpc_b64 s[30:31]
28356;
28357; GFX10-LABEL: v_copysign_f16_bf16:
28358; GFX10:       ; %bb.0:
28359; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28360; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
28361; GFX10-NEXT:    s_setpc_b64 s[30:31]
28362;
28363; GFX11-LABEL: v_copysign_f16_bf16:
28364; GFX11:       ; %bb.0:
28365; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28366; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
28367; GFX11-NEXT:    s_setpc_b64 s[30:31]
28368  %sign = bitcast bfloat %sign.bf16 to half
28369  %op = call half @llvm.copysign.f16(half %mag, half %sign)
28370  ret half %op
28371}
28372
; amdgpu_ps variant of half copysign with inreg operands, where the sign is a
; bfloat reinterpreted as half with a bitcast. The GCN and GFX7 checks go
; through v_cvt_f16_f32 / v_cvt_f32_f16 conversions around a v_bfi_b32; the
; remaining checks merge s0 and s1 with a 0x7fff v_bfi_b32 and mask the result
; with 0xffff. The half result is bitcast to i16, zero-extended to i32 and
; returned through llvm.amdgcn.readfirstlane.
28373define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
28374; GCN-LABEL: s_copysign_f16_bf16:
28375; GCN:       ; %bb.0:
28376; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s1
28377; GCN-NEXT:    v_cvt_f16_f32_e32 v1, s0
28378; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
28379; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
28380; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
28381; GCN-NEXT:    s_brev_b32 s0, -2
28382; GCN-NEXT:    v_bfi_b32 v0, s0, v1, v0
28383; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
28384; GCN-NEXT:    v_readfirstlane_b32 s0, v0
28385; GCN-NEXT:    ; return to shader part epilog
28386;
28387; GFX7-LABEL: s_copysign_f16_bf16:
28388; GFX7:       ; %bb.0:
28389; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, s0
28390; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s1
28391; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
28392; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
28393; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
28394; GFX7-NEXT:    s_brev_b32 s0, -2
28395; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
28396; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
28397; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
28398; GFX7-NEXT:    ; return to shader part epilog
28399;
28400; GFX8-LABEL: s_copysign_f16_bf16:
28401; GFX8:       ; %bb.0:
28402; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
28403; GFX8-NEXT:    v_mov_b32_e32 v0, s0
28404; GFX8-NEXT:    v_mov_b32_e32 v1, s1
28405; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
28406; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28407; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
28408; GFX8-NEXT:    ; return to shader part epilog
28409;
28410; GFX9-LABEL: s_copysign_f16_bf16:
28411; GFX9:       ; %bb.0:
28412; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
28413; GFX9-NEXT:    v_mov_b32_e32 v0, s0
28414; GFX9-NEXT:    v_mov_b32_e32 v1, s1
28415; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
28416; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28417; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
28418; GFX9-NEXT:    ; return to shader part epilog
28419;
28420; GFX10-LABEL: s_copysign_f16_bf16:
28421; GFX10:       ; %bb.0:
28422; GFX10-NEXT:    v_mov_b32_e32 v0, s1
28423; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28424; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28425; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
28426; GFX10-NEXT:    ; return to shader part epilog
28427;
28428; GFX11-LABEL: s_copysign_f16_bf16:
28429; GFX11:       ; %bb.0:
28430; GFX11-NEXT:    v_mov_b32_e32 v0, s1
28431; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28432; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
28433; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28434; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28435; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
28436; GFX11-NEXT:    ; return to shader part epilog
28437  %sign = bitcast bfloat %sign.bf16 to half
28438  %op = call half @llvm.copysign.f16(half %mag, half %sign)
28439  %cast = bitcast half %op to i16
28440  %zext = zext i16 %cast to i32
28441  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28442  ret i32 %readlane
28443}
28444
; Declaration of the double copysign intrinsic, called by the tests that pair a
; double magnitude with a bfloat sign.
28445declare double @llvm.copysign.f64(double, double)
28446
; double copysign whose sign operand is a bfloat widened with fpext. The
; expected code rewrites only v1 (presumably the high dword of the double
; magnitude) with a 0x7fffffff v_bfi_b32 and takes the sign from v2. The GCN
; and GFX7 checks use v2 directly; the remaining checks shift it left by 16
; first.
28447define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
28448; GCN-LABEL: v_copysign_f64_bf16:
28449; GCN:       ; %bb.0:
28450; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28451; GCN-NEXT:    s_brev_b32 s4, -2
28452; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
28453; GCN-NEXT:    s_setpc_b64 s[30:31]
28454;
28455; GFX7-LABEL: v_copysign_f64_bf16:
28456; GFX7:       ; %bb.0:
28457; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28458; GFX7-NEXT:    s_brev_b32 s4, -2
28459; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
28460; GFX7-NEXT:    s_setpc_b64 s[30:31]
28461;
28462; GFX8-LABEL: v_copysign_f64_bf16:
28463; GFX8:       ; %bb.0:
28464; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28465; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
28466; GFX8-NEXT:    s_brev_b32 s4, -2
28467; GFX8-NEXT:    v_bfi_b32 v1, s4, v1, v2
28468; GFX8-NEXT:    s_setpc_b64 s[30:31]
28469;
28470; GFX9-LABEL: v_copysign_f64_bf16:
28471; GFX9:       ; %bb.0:
28472; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28473; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
28474; GFX9-NEXT:    s_brev_b32 s4, -2
28475; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
28476; GFX9-NEXT:    s_setpc_b64 s[30:31]
28477;
28478; GFX10-LABEL: v_copysign_f64_bf16:
28479; GFX10:       ; %bb.0:
28480; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28481; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
28482; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
28483; GFX10-NEXT:    s_setpc_b64 s[30:31]
28484;
28485; GFX11-LABEL: v_copysign_f64_bf16:
28486; GFX11:       ; %bb.0:
28487; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28488; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
28489; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28490; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
28491; GFX11-NEXT:    s_setpc_b64 s[30:31]
28492  %sign = fpext bfloat %sign.bf16 to double
28493  %op = call double @llvm.copysign.f64(double %mag, double %sign)
28494  ret double %op
28495}
28496
; Test: the same double/bfloat copysign as the VGPR variant, but with the
; amdgpu_ps calling convention and both arguments marked inreg. The expected
; code reads the operands from s1 and s2.
;
; The result is returned as <2 x i32>, each element passed through
; llvm.amdgcn.readfirstlane. The expectations contain a single v_bfi_b32
; followed by one v_readfirstlane_b32 into s1; no instruction that writes s0 is
; expected on any target. GFX8 and later shift s2 left by 16 into a VGPR
; first; the GCN and GFX7 expectations copy s1 and s2 into VGPRs unshifted.
28497define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
28498; GCN-LABEL: s_copysign_f64_bf16:
28499; GCN:       ; %bb.0:
28500; GCN-NEXT:    s_brev_b32 s3, -2
28501; GCN-NEXT:    v_mov_b32_e32 v0, s1
28502; GCN-NEXT:    v_mov_b32_e32 v1, s2
28503; GCN-NEXT:    v_bfi_b32 v0, s3, v0, v1
28504; GCN-NEXT:    v_readfirstlane_b32 s1, v0
28505; GCN-NEXT:    ; return to shader part epilog
28506;
28507; GFX7-LABEL: s_copysign_f64_bf16:
28508; GFX7:       ; %bb.0:
28509; GFX7-NEXT:    s_brev_b32 s3, -2
28510; GFX7-NEXT:    v_mov_b32_e32 v0, s1
28511; GFX7-NEXT:    v_mov_b32_e32 v1, s2
28512; GFX7-NEXT:    v_bfi_b32 v0, s3, v0, v1
28513; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
28514; GFX7-NEXT:    ; return to shader part epilog
28515;
28516; GFX8-LABEL: s_copysign_f64_bf16:
28517; GFX8:       ; %bb.0:
28518; GFX8-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
28519; GFX8-NEXT:    s_brev_b32 s2, -2
28520; GFX8-NEXT:    v_mov_b32_e32 v1, s1
28521; GFX8-NEXT:    v_bfi_b32 v0, s2, v1, v0
28522; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
28523; GFX8-NEXT:    ; return to shader part epilog
28524;
28525; GFX9-LABEL: s_copysign_f64_bf16:
28526; GFX9:       ; %bb.0:
28527; GFX9-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
28528; GFX9-NEXT:    s_brev_b32 s2, -2
28529; GFX9-NEXT:    v_mov_b32_e32 v1, s1
28530; GFX9-NEXT:    v_bfi_b32 v0, s2, v1, v0
28531; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
28532; GFX9-NEXT:    ; return to shader part epilog
28533;
28534; GFX10-LABEL: s_copysign_f64_bf16:
28535; GFX10:       ; %bb.0:
28536; GFX10-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
28537; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, s1, v0
28538; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
28539; GFX10-NEXT:    ; return to shader part epilog
28540;
28541; GFX11-LABEL: s_copysign_f64_bf16:
28542; GFX11:       ; %bb.0:
28543; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
28544; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28545; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s1, v0
28546; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
28547; GFX11-NEXT:    ; return to shader part epilog
; Widen the bfloat sign source and apply copysign, as in the VGPR variant.
28548  %sign = fpext bfloat %sign.bf16 to double
28549  %op = call double @llvm.copysign.f64(double %mag, double %sign)
; Split the double into two i32 halves, run each through readfirstlane, and
; reassemble them into the <2 x i32> return value.
28550  %cast = bitcast double %op to <2 x i32>
28551  %cast.0 = extractelement <2 x i32> %cast, i32 0
28552  %cast.1 = extractelement <2 x i32> %cast, i32 1
28553  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
28554  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
28555  %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
28556  %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
28557  ret <2 x i32> %ins.1
28558}
28559
; Test: signed float-to-int conversion of a scalar bfloat to i16.
;
; Every target is expected to finish with v_cvt_i32_f32 on v0. The difference
; is in how the input is prepared: the GCN and GFX7 expectations multiply by
; 1.0 and mask with 0xffff0000, while GFX8 and later shift v0 left by 16.
; No 16-bit truncation instruction is expected after the conversion.
28560define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
28561; GCN-LABEL: v_fptosi_bf16_to_i16:
28562; GCN:       ; %bb.0:
28563; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28564; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28565; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28566; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
28567; GCN-NEXT:    s_setpc_b64 s[30:31]
28568;
28569; GFX7-LABEL: v_fptosi_bf16_to_i16:
28570; GFX7:       ; %bb.0:
28571; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28572; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28573; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28574; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
28575; GFX7-NEXT:    s_setpc_b64 s[30:31]
28576;
28577; GFX8-LABEL: v_fptosi_bf16_to_i16:
28578; GFX8:       ; %bb.0:
28579; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28580; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28581; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
28582; GFX8-NEXT:    s_setpc_b64 s[30:31]
28583;
28584; GFX9-LABEL: v_fptosi_bf16_to_i16:
28585; GFX9:       ; %bb.0:
28586; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28587; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28588; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
28589; GFX9-NEXT:    s_setpc_b64 s[30:31]
28590;
28591; GFX10-LABEL: v_fptosi_bf16_to_i16:
28592; GFX10:       ; %bb.0:
28593; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28594; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28595; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
28596; GFX10-NEXT:    s_setpc_b64 s[30:31]
28597;
28598; GFX11-LABEL: v_fptosi_bf16_to_i16:
28599; GFX11:       ; %bb.0:
28600; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28601; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28602; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28603; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
28604; GFX11-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: a single fptosi from bfloat to i16.
28605  %op = fptosi bfloat %x to i16
28606  ret i16 %op
28607}
28608
; Test: signed float-to-int conversion of <2 x bfloat> to <2 x i16>.
;
; Each element is expected to go through its own v_cvt_i32_f32. The targets
; differ in how the two 16-bit results are recombined:
;   - GCN and GFX7: shift/mask/or sequence, with the second element also left
;     masked to 16 bits in v1.
;   - GFX8: SDWA forms (conversion writing WORD_1, then an SDWA or).
;   - GFX9 and later: v_perm_b32 with selector 0x5040100 (held in s4 on GFX9,
;     used as a literal on GFX10/GFX11).
; GFX11 has separate expectations for the +real-true16 and -real-true16 runs;
; they differ in register assignment and v_perm_b32 operand order.
28609define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
28610; GCN-LABEL: v_fptosi_v2bf16_to_v2i16:
28611; GCN:       ; %bb.0:
28612; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28613; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28614; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28615; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28616; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28617; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
28618; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
28619; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
28620; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28621; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
28622; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
28623; GCN-NEXT:    s_setpc_b64 s[30:31]
28624;
28625; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16:
28626; GFX7:       ; %bb.0:
28627; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28628; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28629; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28630; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28631; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28632; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
28633; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
28634; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
28635; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28636; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
28637; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
28638; GFX7-NEXT:    s_setpc_b64 s[30:31]
28639;
28640; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16:
28641; GFX8:       ; %bb.0:
28642; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28643; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
28644; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28645; GFX8-NEXT:    v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28646; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
28647; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28648; GFX8-NEXT:    s_setpc_b64 s[30:31]
28649;
28650; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
28651; GFX9:       ; %bb.0:
28652; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28653; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
28654; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28655; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
28656; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
28657; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
28658; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
28659; GFX9-NEXT:    s_setpc_b64 s[30:31]
28660;
28661; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
28662; GFX10:       ; %bb.0:
28663; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28664; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
28665; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28666; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
28667; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
28668; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
28669; GFX10-NEXT:    s_setpc_b64 s[30:31]
28670;
28671; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
28672; GFX11TRUE16:       ; %bb.0:
28673; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28674; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
28675; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28676; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28677; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
28678; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
28679; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28680; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
28681; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
28682;
28683; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
28684; GFX11FAKE16:       ; %bb.0:
28685; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28686; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
28687; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28688; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28689; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
28690; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
28691; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28692; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
28693; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: one vector fptosi over both elements.
28694  %op = fptosi <2 x bfloat> %x to <2 x i16>
28695  ret <2 x i16> %op
28696}
28697
; Test: signed float-to-int conversion of <3 x bfloat> to <3 x i16>, covering
; an odd element count.
;
; Three v_cvt_i32_f32 conversions are expected on every target. On GFX8 and
; later only the first two results are combined into v0 (SDWA or on GFX8,
; v_perm_b32 with selector 0x5040100 on GFX9 and later); the third element is
; converted in v1 with no packing step. The GCN and GFX7 expectations combine
; the results with shift/mask/or plus a v_alignbit_b32 into v1, and leave the
; masked third element in v2.
; GFX11 has separate expectations for the +real-true16 and -real-true16 runs.
28698define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
28699; GCN-LABEL: v_fptosi_v3bf16_to_v3i16:
28700; GCN:       ; %bb.0:
28701; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28702; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
28703; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28704; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28705; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28706; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28707; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
28708; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
28709; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
28710; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v2
28711; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28712; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28713; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v3
28714; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
28715; GCN-NEXT:    v_alignbit_b32 v1, v3, v1, 16
28716; GCN-NEXT:    s_setpc_b64 s[30:31]
28717;
28718; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16:
28719; GFX7:       ; %bb.0:
28720; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28721; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28722; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28723; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
28724; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28725; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28726; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
28727; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
28728; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
28729; GFX7-NEXT:    v_cvt_i32_f32_e32 v3, v2
28730; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28731; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28732; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
28733; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v3
28734; GFX7-NEXT:    v_alignbit_b32 v1, v3, v1, 16
28735; GFX7-NEXT:    s_setpc_b64 s[30:31]
28736;
28737; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
28738; GFX8:       ; %bb.0:
28739; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28740; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
28741; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28742; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
28743; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28744; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28745; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v1
28746; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28747; GFX8-NEXT:    s_setpc_b64 s[30:31]
28748;
28749; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
28750; GFX9:       ; %bb.0:
28751; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28752; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
28753; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28754; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
28755; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
28756; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28757; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
28758; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
28759; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
28760; GFX9-NEXT:    s_setpc_b64 s[30:31]
28761;
28762; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
28763; GFX10:       ; %bb.0:
28764; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28765; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
28766; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28767; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28768; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
28769; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
28770; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
28771; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
28772; GFX10-NEXT:    s_setpc_b64 s[30:31]
28773;
28774; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
28775; GFX11TRUE16:       ; %bb.0:
28776; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28777; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
28778; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28779; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28780; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28781; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
28782; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
28783; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28784; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
28785; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
28786; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
28787;
28788; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
28789; GFX11FAKE16:       ; %bb.0:
28790; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28791; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
28792; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28793; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28794; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28795; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
28796; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
28797; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28798; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
28799; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
28800; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: one vector fptosi over all three elements.
28801  %op = fptosi <3 x bfloat> %x to <3 x i16>
28802  ret <3 x i16> %op
28803}
28804
; Test: signed float-to-int conversion of <4 x bfloat> to <4 x i16>.
;
; Four v_cvt_i32_f32 conversions are expected on every target. On GFX8 and
; later the results are combined pairwise into v0 and v1: two SDWA or
; instructions on GFX8, two v_perm_b32 with selector 0x5040100 on GFX9 and
; later (GFX9 shares the selector through s4). The GCN and GFX7 expectations
; use shift/mask/or plus a v_alignbit_b32 and touch v0 through v4.
; GFX11 has separate expectations for the +real-true16 and -real-true16 runs.
28805define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
28806; GCN-LABEL: v_fptosi_v4bf16_to_v4i16:
28807; GCN:       ; %bb.0:
28808; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28809; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
28810; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
28811; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28812; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28813; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28814; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28815; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
28816; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
28817; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
28818; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
28819; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
28820; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
28821; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28822; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28823; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
28824; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
28825; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
28826; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
28827; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
28828; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
28829; GCN-NEXT:    s_setpc_b64 s[30:31]
28830;
28831; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16:
28832; GFX7:       ; %bb.0:
28833; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28834; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
28835; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
28836; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28837; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28838; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
28839; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
28840; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28841; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28842; GFX7-NEXT:    v_cvt_i32_f32_e32 v3, v3
28843; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v2
28844; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
28845; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
28846; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
28847; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
28848; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28849; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
28850; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
28851; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
28852; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
28853; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
28854; GFX7-NEXT:    s_setpc_b64 s[30:31]
28855;
28856; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
28857; GFX8:       ; %bb.0:
28858; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28859; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
28860; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28861; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
28862; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28863; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
28864; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v3
28865; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28866; GFX8-NEXT:    v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28867; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28868; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28869; GFX8-NEXT:    s_setpc_b64 s[30:31]
28870;
28871; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
28872; GFX9:       ; %bb.0:
28873; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28874; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
28875; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28876; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
28877; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28878; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
28879; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
28880; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
28881; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
28882; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
28883; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
28884; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
28885; GFX9-NEXT:    s_setpc_b64 s[30:31]
28886;
28887; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
28888; GFX10:       ; %bb.0:
28889; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28890; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
28891; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
28892; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28893; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28894; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
28895; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v3
28896; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
28897; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
28898; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
28899; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
28900; GFX10-NEXT:    s_setpc_b64 s[30:31]
28901;
28902; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
28903; GFX11TRUE16:       ; %bb.0:
28904; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28905; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
28906; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28907; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
28908; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
28909; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28910; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
28911; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
28912; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28913; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v3, v3
28914; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
28915; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28916; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
28917; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
28918; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
28919;
28920; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
28921; GFX11FAKE16:       ; %bb.0:
28922; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28923; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
28924; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
28925; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28926; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28927; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28928; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
28929; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v3, v3
28930; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28931; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
28932; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
28933; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28934; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
28935; GFX11FAKE16-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
28936; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: one vector fptosi over all four elements.
28937  %op = fptosi <4 x bfloat> %x to <4 x i16>
28938  ret <4 x i16> %op
28939}
28940
; Test: signed float-to-int conversion of a scalar bfloat to i32.
;
; The expected instruction sequences are the same as for the scalar i16
; variant: GCN and GFX7 multiply by 1.0 and mask with 0xffff0000, GFX8 and
; later shift v0 left by 16, and every target then issues one v_cvt_i32_f32.
28941define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
28942; GCN-LABEL: v_fptosi_bf16_to_i32:
28943; GCN:       ; %bb.0:
28944; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28945; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28946; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28947; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
28948; GCN-NEXT:    s_setpc_b64 s[30:31]
28949;
28950; GFX7-LABEL: v_fptosi_bf16_to_i32:
28951; GFX7:       ; %bb.0:
28952; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28953; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28954; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28955; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
28956; GFX7-NEXT:    s_setpc_b64 s[30:31]
28957;
28958; GFX8-LABEL: v_fptosi_bf16_to_i32:
28959; GFX8:       ; %bb.0:
28960; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28961; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28962; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
28963; GFX8-NEXT:    s_setpc_b64 s[30:31]
28964;
28965; GFX9-LABEL: v_fptosi_bf16_to_i32:
28966; GFX9:       ; %bb.0:
28967; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28968; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28969; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
28970; GFX9-NEXT:    s_setpc_b64 s[30:31]
28971;
28972; GFX10-LABEL: v_fptosi_bf16_to_i32:
28973; GFX10:       ; %bb.0:
28974; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28975; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28976; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
28977; GFX10-NEXT:    s_setpc_b64 s[30:31]
28978;
28979; GFX11-LABEL: v_fptosi_bf16_to_i32:
28980; GFX11:       ; %bb.0:
28981; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28982; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
28983; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
28984; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
28985; GFX11-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: a single fptosi from bfloat to i32.
28986  %op = fptosi bfloat %x to i32
28987  ret i32 %op
28988}
28989
; Test: signed float-to-int conversion of <2 x bfloat> to <2 x i32>.
;
; Each element is expected to be converted with its own v_cvt_i32_f32, with
; results left in v0 and v1 and no packing step. On GFX8 and later both
; elements are taken from v0 (shift left by 16 for one, mask 0xffff0000 for the
; other); the GCN and GFX7 expectations read them from v0 and v1 separately.
; The GFX8 and GFX9 expectations route the first result through v2 and end
; with an extra v_mov_b32 into v0; GFX10 and GFX11 have no such move.
28990define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
28991; GCN-LABEL: v_fptosi_v2bf16_to_v2i32:
28992; GCN:       ; %bb.0:
28993; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28994; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
28995; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
28996; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
28997; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
28998; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
28999; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
29000; GCN-NEXT:    s_setpc_b64 s[30:31]
29001;
29002; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32:
29003; GFX7:       ; %bb.0:
29004; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29005; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29006; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29007; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29008; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29009; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
29010; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
29011; GFX7-NEXT:    s_setpc_b64 s[30:31]
29012;
29013; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32:
29014; GFX8:       ; %bb.0:
29015; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29016; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29017; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v1
29018; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29019; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v0
29020; GFX8-NEXT:    v_mov_b32_e32 v0, v2
29021; GFX8-NEXT:    s_setpc_b64 s[30:31]
29022;
29023; GFX9-LABEL: v_fptosi_v2bf16_to_v2i32:
29024; GFX9:       ; %bb.0:
29025; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29026; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29027; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v1
29028; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29029; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v0
29030; GFX9-NEXT:    v_mov_b32_e32 v0, v2
29031; GFX9-NEXT:    s_setpc_b64 s[30:31]
29032;
29033; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32:
29034; GFX10:       ; %bb.0:
29035; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29036; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29037; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
29038; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v1
29039; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v2
29040; GFX10-NEXT:    s_setpc_b64 s[30:31]
29041;
29042; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32:
29043; GFX11:       ; %bb.0:
29044; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29045; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29046; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
29047; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29048; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v1
29049; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v2
29050; GFX11-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: one vector fptosi over both elements.
29051  %op = fptosi <2 x bfloat> %x to <2 x i32>
29052  ret <2 x i32> %op
29053}
29054
; Test: signed float-to-int conversion of <3 x bfloat> to <3 x i32>, covering
; an odd element count.
;
; Three v_cvt_i32_f32 conversions are expected on every target, with results
; in v0, v1 and v2 and no packing step. On GFX8 and later the first two
; elements are taken from v0 (shift left by 16 / mask 0xffff0000) and the
; third from v1 via a shift; the GCN and GFX7 expectations read v0, v1 and v2
; separately. The GFX8 and GFX9 expectations end with two v_mov_b32 copies
; into v0 and v1; GFX10 and GFX11 have none.
29055define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
29056; GCN-LABEL: v_fptosi_v3bf16_to_v3i32:
29057; GCN:       ; %bb.0:
29058; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29059; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29060; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29061; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29062; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29063; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29064; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29065; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
29066; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
29067; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
29068; GCN-NEXT:    s_setpc_b64 s[30:31]
29069;
29070; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32:
29071; GFX7:       ; %bb.0:
29072; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29073; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29074; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29075; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29076; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29077; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29078; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29079; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
29080; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
29081; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v2
29082; GFX7-NEXT:    s_setpc_b64 s[30:31]
29083;
29084; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32:
29085; GFX8:       ; %bb.0:
29086; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29087; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29088; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29089; GFX8-NEXT:    v_cvt_i32_f32_e32 v4, v2
29090; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v0
29091; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
29092; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v0
29093; GFX8-NEXT:    v_mov_b32_e32 v0, v4
29094; GFX8-NEXT:    v_mov_b32_e32 v1, v3
29095; GFX8-NEXT:    s_setpc_b64 s[30:31]
29096;
29097; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32:
29098; GFX9:       ; %bb.0:
29099; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29100; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29101; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29102; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
29103; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v0
29104; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
29105; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v0
29106; GFX9-NEXT:    v_mov_b32_e32 v0, v4
29107; GFX9-NEXT:    v_mov_b32_e32 v1, v3
29108; GFX9-NEXT:    s_setpc_b64 s[30:31]
29109;
29110; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32:
29111; GFX10:       ; %bb.0:
29112; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29113; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29114; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
29115; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
29116; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v2
29117; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v3
29118; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v4
29119; GFX10-NEXT:    s_setpc_b64 s[30:31]
29120;
29121; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32:
29122; GFX11:       ; %bb.0:
29123; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29124; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29125; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
29126; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
29127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29128; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v2
29129; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v3
29130; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
29131; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v4
29132; GFX11-NEXT:    s_setpc_b64 s[30:31]
; The operation under test: one vector fptosi over all three elements.
29133  %op = fptosi <3 x bfloat> %x to <3 x i32>
29134  ret <3 x i32> %op
29135}
29136
; Signed conversion (fptosi) of a <4 x bfloat> argument to <4 x i32>, returned
; directly. The assertions below are autogenerated (see the first line of this
; file); regenerate them with the update script rather than editing by hand.
; What the expected output shows:
;  - For the first two run lines (GCN and GFX7 prefixes) each element is read
;    from its own VGPR (v0-v3), multiplied by 1.0 and masked with 0xffff0000
;    before v_cvt_i32_f32.
;  - For the later targets two elements are unpacked from each of v0 and v1
;    (shift left by 16 for the low half, 0xffff0000 mask for the high half).
;  - GFX8 and GFX9 end with two v_mov copies into v0/v1; GFX10 and GFX11 write
;    the results straight into v0-v3 (GFX11 adds s_delay_alu hints).
29137define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
29138; GCN-LABEL: v_fptosi_v4bf16_to_v4i32:
29139; GCN:       ; %bb.0:
29140; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29141; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
29142; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29143; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29144; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29145; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29146; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29147; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29148; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
29149; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
29150; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
29151; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
29152; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
29153; GCN-NEXT:    s_setpc_b64 s[30:31]
29154;
29155; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32:
29156; GFX7:       ; %bb.0:
29157; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29158; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
29159; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29160; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29161; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29162; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29163; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29164; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29165; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
29166; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
29167; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
29168; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v2
29169; GFX7-NEXT:    v_cvt_i32_f32_e32 v3, v3
29170; GFX7-NEXT:    s_setpc_b64 s[30:31]
29171;
29172; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32:
29173; GFX8:       ; %bb.0:
29174; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29175; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29176; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29177; GFX8-NEXT:    v_cvt_i32_f32_e32 v5, v0
29178; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
29179; GFX8-NEXT:    v_cvt_i32_f32_e32 v4, v2
29180; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v0
29181; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v1
29182; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v0
29183; GFX8-NEXT:    v_mov_b32_e32 v0, v4
29184; GFX8-NEXT:    v_mov_b32_e32 v1, v5
29185; GFX8-NEXT:    s_setpc_b64 s[30:31]
29186;
29187; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32:
29188; GFX9:       ; %bb.0:
29189; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29190; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29191; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29192; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v0
29193; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
29194; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
29195; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v0
29196; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v1
29197; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v0
29198; GFX9-NEXT:    v_mov_b32_e32 v0, v4
29199; GFX9-NEXT:    v_mov_b32_e32 v1, v5
29200; GFX9-NEXT:    s_setpc_b64 s[30:31]
29201;
29202; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32:
29203; GFX10:       ; %bb.0:
29204; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29205; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29206; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
29207; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
29208; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
29209; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v2
29210; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v3
29211; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v4
29212; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v5
29213; GFX10-NEXT:    s_setpc_b64 s[30:31]
29214;
29215; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32:
29216; GFX11:       ; %bb.0:
29217; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29218; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29219; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
29220; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
29221; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
29222; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29223; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v2
29224; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v3
29225; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29226; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v4
29227; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v5
29228; GFX11-NEXT:    s_setpc_b64 s[30:31]
29229  %op = fptosi <4 x bfloat> %x to <4 x i32>
29230  ret <4 x i32> %op
29231}
29232
; Signed conversion (fptosi) of a scalar bfloat argument to i64.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script rather than editing by hand.
; Every target's expected output shows the same expansion:
;   t    = v_trunc_f32 of the input value
;   hi   = floor(|t| * 0x2f800000)      ; constant is the f32 encoding of 2^-32
;   lo   = fma(hi, 0xcf800000, |t|)     ; constant is the f32 encoding of -2^32
;   hi, lo are each converted with v_cvt_u32_f32
;   sign = t >> 31 (arithmetic shift)
;   result = ((hi:lo) xor (sign:sign)) - (sign:sign), done as a 32-bit subtract
;            followed by a subtract-with-borrow through vcc.
; The first two run lines (GCN and GFX7 prefixes) begin with a multiply by 1.0
; and a 0xffff0000 mask on v0; the later targets begin with a 16-bit left
; shift of v0 instead. GFX10 and GFX11 use the constants as inline literals
; rather than loading them into s4/s5.
29233define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
29234; GCN-LABEL: v_fptosi_bf16_to_i64:
29235; GCN:       ; %bb.0:
29236; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29237; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29238; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
29239; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
29240; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29241; GCN-NEXT:    v_trunc_f32_e32 v0, v0
29242; GCN-NEXT:    v_mul_f32_e64 v1, |v0|, s4
29243; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
29244; GCN-NEXT:    v_floor_f32_e32 v1, v1
29245; GCN-NEXT:    v_fma_f32 v0, v1, s5, |v0|
29246; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
29247; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
29248; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
29249; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
29250; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
29251; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
29252; GCN-NEXT:    s_setpc_b64 s[30:31]
29253;
29254; GFX7-LABEL: v_fptosi_bf16_to_i64:
29255; GFX7:       ; %bb.0:
29256; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29257; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29258; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29259; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
29260; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
29261; GFX7-NEXT:    v_mul_f32_e64 v1, |v0|, s4
29262; GFX7-NEXT:    v_floor_f32_e32 v1, v1
29263; GFX7-NEXT:    s_mov_b32 s4, 0xcf800000
29264; GFX7-NEXT:    v_fma_f32 v2, v1, s4, |v0|
29265; GFX7-NEXT:    v_cvt_u32_f32_e32 v2, v2
29266; GFX7-NEXT:    v_cvt_u32_f32_e32 v1, v1
29267; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
29268; GFX7-NEXT:    v_xor_b32_e32 v0, v2, v3
29269; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
29270; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
29271; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
29272; GFX7-NEXT:    s_setpc_b64 s[30:31]
29273;
29274; GFX8-LABEL: v_fptosi_bf16_to_i64:
29275; GFX8:       ; %bb.0:
29276; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29277; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
29278; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
29279; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
29280; GFX8-NEXT:    v_mul_f32_e64 v1, |v0|, s4
29281; GFX8-NEXT:    v_floor_f32_e32 v1, v1
29282; GFX8-NEXT:    s_mov_b32 s4, 0xcf800000
29283; GFX8-NEXT:    v_fma_f32 v2, v1, s4, |v0|
29284; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
29285; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
29286; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
29287; GFX8-NEXT:    v_xor_b32_e32 v0, v2, v3
29288; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
29289; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
29290; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
29291; GFX8-NEXT:    s_setpc_b64 s[30:31]
29292;
29293; GFX9-LABEL: v_fptosi_bf16_to_i64:
29294; GFX9:       ; %bb.0:
29295; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29296; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
29297; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
29298; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
29299; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
29300; GFX9-NEXT:    v_floor_f32_e32 v1, v1
29301; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
29302; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
29303; GFX9-NEXT:    v_fma_f32 v1, v1, s4, |v0|
29304; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
29305; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
29306; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v3
29307; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
29308; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
29309; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
29310; GFX9-NEXT:    s_setpc_b64 s[30:31]
29311;
29312; GFX10-LABEL: v_fptosi_bf16_to_i64:
29313; GFX10:       ; %bb.0:
29314; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29315; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
29316; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
29317; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
29318; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
29319; GFX10-NEXT:    v_floor_f32_e32 v1, v1
29320; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
29321; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
29322; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v2
29323; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
29324; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
29325; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
29326; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
29327; GFX10-NEXT:    s_setpc_b64 s[30:31]
29328;
29329; GFX11-LABEL: v_fptosi_bf16_to_i64:
29330; GFX11:       ; %bb.0:
29331; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29332; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
29333; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29334; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
29335; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
29336; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
29337; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
29338; GFX11-NEXT:    v_floor_f32_e32 v1, v1
29339; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
29340; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
29341; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29342; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v2
29343; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
29344; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
29345; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
29346; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
29347; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
29348; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
29349; GFX11-NEXT:    s_setpc_b64 s[30:31]
29350  %op = fptosi bfloat %x to i64
29351  ret i64 %op
29352}
29353
; Signed conversion (fptosi) of a <2 x bfloat> argument to <2 x i64>.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script rather than editing by hand.
; Each element goes through the same sequence in the expected output:
; v_trunc_f32, hi = floor(|t| * 0x2f800000), lo = fma(hi, 0xcf800000, |t|)
; (the constants are the f32 encodings of 2^-32 and -2^32), v_cvt_u32_f32 on
; both halves, then xor with the sign mask (t >> 31, arithmetic) and a
; subtract / subtract-with-borrow pair. Results land in v[0:1] and v[2:3].
; Input handling differs by target: the first two run lines (GCN and GFX7
; prefixes) read the elements from v0 and v1 (multiply by 1.0, then mask with
; 0xffff0000); the later targets unpack both elements from v0 (left shift by
; 16 for the low half, 0xffff0000 mask for the high half).
29354define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
29355; GCN-LABEL: v_fptosi_v2bf16_to_v2i64:
29356; GCN:       ; %bb.0:
29357; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29358; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29359; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29360; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
29361; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
29362; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29363; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29364; GCN-NEXT:    v_trunc_f32_e32 v0, v0
29365; GCN-NEXT:    v_trunc_f32_e32 v1, v1
29366; GCN-NEXT:    v_mul_f32_e64 v2, |v0|, s4
29367; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
29368; GCN-NEXT:    v_mul_f32_e64 v4, |v1|, s4
29369; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
29370; GCN-NEXT:    v_floor_f32_e32 v2, v2
29371; GCN-NEXT:    v_floor_f32_e32 v4, v4
29372; GCN-NEXT:    v_fma_f32 v0, v2, s5, |v0|
29373; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
29374; GCN-NEXT:    v_fma_f32 v1, v4, s5, |v1|
29375; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
29376; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
29377; GCN-NEXT:    v_xor_b32_e32 v2, v2, v3
29378; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
29379; GCN-NEXT:    v_xor_b32_e32 v4, v4, v5
29380; GCN-NEXT:    v_xor_b32_e32 v0, v0, v3
29381; GCN-NEXT:    v_xor_b32_e32 v6, v1, v5
29382; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
29383; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v3, vcc
29384; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v6, v5
29385; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
29386; GCN-NEXT:    s_setpc_b64 s[30:31]
29387;
29388; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64:
29389; GFX7:       ; %bb.0:
29390; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29391; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29392; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29393; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
29394; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
29395; GFX7-NEXT:    v_mul_f32_e64 v2, |v0|, s4
29396; GFX7-NEXT:    v_floor_f32_e32 v2, v2
29397; GFX7-NEXT:    s_mov_b32 s5, 0xcf800000
29398; GFX7-NEXT:    v_fma_f32 v3, v2, s5, |v0|
29399; GFX7-NEXT:    v_cvt_u32_f32_e32 v3, v3
29400; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29401; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
29402; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29403; GFX7-NEXT:    v_xor_b32_e32 v0, v3, v4
29404; GFX7-NEXT:    v_trunc_f32_e32 v3, v1
29405; GFX7-NEXT:    v_mul_f32_e64 v1, |v3|, s4
29406; GFX7-NEXT:    v_floor_f32_e32 v1, v1
29407; GFX7-NEXT:    v_cvt_u32_f32_e32 v2, v2
29408; GFX7-NEXT:    v_fma_f32 v5, v1, s5, |v3|
29409; GFX7-NEXT:    v_cvt_u32_f32_e32 v5, v5
29410; GFX7-NEXT:    v_cvt_u32_f32_e32 v6, v1
29411; GFX7-NEXT:    v_xor_b32_e32 v2, v2, v4
29412; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
29413; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
29414; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
29415; GFX7-NEXT:    v_xor_b32_e32 v2, v5, v3
29416; GFX7-NEXT:    v_xor_b32_e32 v4, v6, v3
29417; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
29418; GFX7-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
29419; GFX7-NEXT:    s_setpc_b64 s[30:31]
29420;
29421; GFX8-LABEL: v_fptosi_v2bf16_to_v2i64:
29422; GFX8:       ; %bb.0:
29423; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29424; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29425; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
29426; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
29427; GFX8-NEXT:    v_mul_f32_e64 v2, |v1|, s4
29428; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29429; GFX8-NEXT:    v_floor_f32_e32 v2, v2
29430; GFX8-NEXT:    s_mov_b32 s5, 0xcf800000
29431; GFX8-NEXT:    v_trunc_f32_e32 v4, v0
29432; GFX8-NEXT:    v_fma_f32 v3, v2, s5, |v1|
29433; GFX8-NEXT:    v_mul_f32_e64 v0, |v4|, s4
29434; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
29435; GFX8-NEXT:    v_floor_f32_e32 v0, v0
29436; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
29437; GFX8-NEXT:    v_fma_f32 v5, v0, s5, |v4|
29438; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
29439; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29440; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v0
29441; GFX8-NEXT:    v_xor_b32_e32 v3, v3, v1
29442; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v1
29443; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v3, v1
29444; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
29445; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
29446; GFX8-NEXT:    v_xor_b32_e32 v2, v5, v3
29447; GFX8-NEXT:    v_xor_b32_e32 v4, v6, v3
29448; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
29449; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
29450; GFX8-NEXT:    s_setpc_b64 s[30:31]
29451;
29452; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64:
29453; GFX9:       ; %bb.0:
29454; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29455; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29456; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
29457; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
29458; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, s4
29459; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29460; GFX9-NEXT:    v_floor_f32_e32 v2, v2
29461; GFX9-NEXT:    s_mov_b32 s5, 0xcf800000
29462; GFX9-NEXT:    v_trunc_f32_e32 v4, v0
29463; GFX9-NEXT:    v_fma_f32 v3, v2, s5, |v1|
29464; GFX9-NEXT:    v_mul_f32_e64 v0, |v4|, s4
29465; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
29466; GFX9-NEXT:    v_floor_f32_e32 v0, v0
29467; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
29468; GFX9-NEXT:    v_fma_f32 v5, v0, s5, |v4|
29469; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
29470; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29471; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v0
29472; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v1
29473; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v1
29474; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v3, v1
29475; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
29476; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
29477; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v3
29478; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v3
29479; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
29480; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
29481; GFX9-NEXT:    s_setpc_b64 s[30:31]
29482;
29483; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
29484; GFX10:       ; %bb.0:
29485; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29486; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29487; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29488; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
29489; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
29490; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
29491; GFX10-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v0|
29492; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
29493; GFX10-NEXT:    v_floor_f32_e32 v2, v2
29494; GFX10-NEXT:    v_floor_f32_e32 v3, v3
29495; GFX10-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v1|
29496; GFX10-NEXT:    v_fma_f32 v5, 0xcf800000, v3, |v0|
29497; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29498; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
29499; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
29500; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v4
29501; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v5
29502; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v1
29503; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v6
29504; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
29505; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v6
29506; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v1
29507; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
29508; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
29509; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
29510; GFX10-NEXT:    s_setpc_b64 s[30:31]
29511;
29512; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64:
29513; GFX11:       ; %bb.0:
29514; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29515; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
29516; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29517; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29518; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
29519; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
29520; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29521; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
29522; GFX11-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v0|
29523; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
29524; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29525; GFX11-NEXT:    v_floor_f32_e32 v2, v2
29526; GFX11-NEXT:    v_floor_f32_e32 v3, v3
29527; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29528; GFX11-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v1|
29529; GFX11-NEXT:    v_fma_f32 v5, 0xcf800000, v3, |v0|
29530; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29531; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
29532; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
29533; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v4
29534; GFX11-NEXT:    v_cvt_u32_f32_e32 v4, v5
29535; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29536; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v1
29537; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v6
29538; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29539; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v1
29540; GFX11-NEXT:    v_xor_b32_e32 v4, v4, v6
29541; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
29542; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v1
29543; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
29544; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
29545; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
29546; GFX11-NEXT:    s_setpc_b64 s[30:31]
29547  %op = fptosi <2 x bfloat> %x to <2 x i64>
29548  ret <2 x i64> %op
29549}
29550
; Signed conversion (fptosi) of a <3 x bfloat> argument to <3 x i64>.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script rather than editing by hand.
; Each element goes through the same sequence in the expected output:
; v_trunc_f32, hi = floor(|t| * 0x2f800000), lo = fma(hi, 0xcf800000, |t|)
; (the constants are the f32 encodings of 2^-32 and -2^32), v_cvt_u32_f32 on
; both halves, then xor with the sign mask (t >> 31, arithmetic) and a
; subtract / subtract-with-borrow pair. The three results are written to
; v[0:1], v[2:3] and v[4:5].
; Input handling differs by target: the first two run lines (GCN and GFX7
; prefixes) read the elements from v0, v1 and v2 (multiply by 1.0, then mask
; with 0xffff0000); the later targets unpack two elements from v0 (left shift
; by 16 / 0xffff0000 mask) and the third from v1 via a left shift by 16.
; GFX8 and GFX9 finish with a v_mov copy of v6 into v1.
29551define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
29552; GCN-LABEL: v_fptosi_v3bf16_to_v3i64:
29553; GCN:       ; %bb.0:
29554; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29555; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29556; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29557; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29558; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
29559; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
29560; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29561; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29562; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29563; GCN-NEXT:    v_trunc_f32_e32 v0, v0
29564; GCN-NEXT:    v_trunc_f32_e32 v1, v1
29565; GCN-NEXT:    v_trunc_f32_e32 v2, v2
29566; GCN-NEXT:    v_mul_f32_e64 v3, |v0|, s4
29567; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
29568; GCN-NEXT:    v_mul_f32_e64 v5, |v1|, s4
29569; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
29570; GCN-NEXT:    v_mul_f32_e64 v7, |v2|, s4
29571; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
29572; GCN-NEXT:    v_floor_f32_e32 v3, v3
29573; GCN-NEXT:    v_floor_f32_e32 v5, v5
29574; GCN-NEXT:    v_floor_f32_e32 v7, v7
29575; GCN-NEXT:    v_fma_f32 v0, v3, s5, |v0|
29576; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
29577; GCN-NEXT:    v_fma_f32 v1, v5, s5, |v1|
29578; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
29579; GCN-NEXT:    v_fma_f32 v2, v7, s5, |v2|
29580; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
29581; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
29582; GCN-NEXT:    v_xor_b32_e32 v3, v3, v4
29583; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
29584; GCN-NEXT:    v_xor_b32_e32 v5, v5, v6
29585; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
29586; GCN-NEXT:    v_xor_b32_e32 v7, v7, v8
29587; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
29588; GCN-NEXT:    v_xor_b32_e32 v9, v1, v6
29589; GCN-NEXT:    v_xor_b32_e32 v10, v2, v8
29590; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
29591; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v4, vcc
29592; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v9, v6
29593; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v5, v6, vcc
29594; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v10, v8
29595; GCN-NEXT:    v_subb_u32_e32 v5, vcc, v7, v8, vcc
29596; GCN-NEXT:    s_setpc_b64 s[30:31]
29597;
29598; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64:
29599; GFX7:       ; %bb.0:
29600; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29601; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29602; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29603; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
29604; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
29605; GFX7-NEXT:    v_mul_f32_e64 v3, |v0|, s4
29606; GFX7-NEXT:    v_floor_f32_e32 v3, v3
29607; GFX7-NEXT:    s_mov_b32 s5, 0xcf800000
29608; GFX7-NEXT:    v_fma_f32 v4, v3, s5, |v0|
29609; GFX7-NEXT:    v_cvt_u32_f32_e32 v4, v4
29610; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29611; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
29612; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29613; GFX7-NEXT:    v_xor_b32_e32 v0, v4, v5
29614; GFX7-NEXT:    v_trunc_f32_e32 v4, v1
29615; GFX7-NEXT:    v_mul_f32_e64 v1, |v4|, s4
29616; GFX7-NEXT:    v_cvt_u32_f32_e32 v3, v3
29617; GFX7-NEXT:    v_floor_f32_e32 v1, v1
29618; GFX7-NEXT:    v_fma_f32 v6, v1, s5, |v4|
29619; GFX7-NEXT:    v_cvt_u32_f32_e32 v6, v6
29620; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29621; GFX7-NEXT:    v_xor_b32_e32 v3, v3, v5
29622; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
29623; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v1
29624; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v3, v5, vcc
29625; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
29626; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29627; GFX7-NEXT:    v_xor_b32_e32 v5, v6, v3
29628; GFX7-NEXT:    v_trunc_f32_e32 v6, v2
29629; GFX7-NEXT:    v_mul_f32_e64 v2, |v6|, s4
29630; GFX7-NEXT:    v_floor_f32_e32 v2, v2
29631; GFX7-NEXT:    v_xor_b32_e32 v4, v7, v3
29632; GFX7-NEXT:    v_fma_f32 v7, v2, s5, |v6|
29633; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v7
29634; GFX7-NEXT:    v_cvt_u32_f32_e32 v8, v2
29635; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v5, v3
29636; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v6
29637; GFX7-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
29638; GFX7-NEXT:    v_xor_b32_e32 v4, v7, v5
29639; GFX7-NEXT:    v_xor_b32_e32 v6, v8, v5
29640; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
29641; GFX7-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
29642; GFX7-NEXT:    s_setpc_b64 s[30:31]
29643;
29644; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64:
29645; GFX8:       ; %bb.0:
29646; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29647; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29648; GFX8-NEXT:    v_trunc_f32_e32 v2, v2
29649; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
29650; GFX8-NEXT:    v_mul_f32_e64 v3, |v2|, s4
29651; GFX8-NEXT:    v_floor_f32_e32 v3, v3
29652; GFX8-NEXT:    s_mov_b32 s5, 0xcf800000
29653; GFX8-NEXT:    v_fma_f32 v4, v3, s5, |v2|
29654; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29655; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v4
29656; GFX8-NEXT:    v_trunc_f32_e32 v5, v0
29657; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
29658; GFX8-NEXT:    v_mul_f32_e64 v0, |v5|, s4
29659; GFX8-NEXT:    v_floor_f32_e32 v0, v0
29660; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
29661; GFX8-NEXT:    v_fma_f32 v6, v0, s5, |v5|
29662; GFX8-NEXT:    v_xor_b32_e32 v4, v4, v2
29663; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v6
29664; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
29665; GFX8-NEXT:    v_xor_b32_e32 v3, v3, v2
29666; GFX8-NEXT:    v_cvt_u32_f32_e32 v8, v0
29667; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v2
29668; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
29669; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v3, v2, vcc
29670; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
29671; GFX8-NEXT:    v_mul_f32_e64 v5, |v1|, s4
29672; GFX8-NEXT:    v_floor_f32_e32 v5, v5
29673; GFX8-NEXT:    v_xor_b32_e32 v2, v7, v3
29674; GFX8-NEXT:    v_fma_f32 v7, v5, s5, |v1|
29675; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
29676; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
29677; GFX8-NEXT:    v_xor_b32_e32 v4, v8, v3
29678; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
29679; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29680; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
29681; GFX8-NEXT:    v_xor_b32_e32 v4, v7, v1
29682; GFX8-NEXT:    v_xor_b32_e32 v5, v5, v1
29683; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v1
29684; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
29685; GFX8-NEXT:    v_mov_b32_e32 v1, v6
29686; GFX8-NEXT:    s_setpc_b64 s[30:31]
29687;
29688; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
29689; GFX9:       ; %bb.0:
29690; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29691; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29692; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
29693; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
29694; GFX9-NEXT:    v_mul_f32_e64 v3, |v2|, s4
29695; GFX9-NEXT:    v_floor_f32_e32 v3, v3
29696; GFX9-NEXT:    s_mov_b32 s5, 0xcf800000
29697; GFX9-NEXT:    v_fma_f32 v4, v3, s5, |v2|
29698; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29699; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
29700; GFX9-NEXT:    v_trunc_f32_e32 v5, v0
29701; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
29702; GFX9-NEXT:    v_mul_f32_e64 v0, |v5|, s4
29703; GFX9-NEXT:    v_floor_f32_e32 v0, v0
29704; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
29705; GFX9-NEXT:    v_fma_f32 v6, v0, s5, |v5|
29706; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v2
29707; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
29708; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
29709; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v2
29710; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v0
29711; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v2
29712; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
29713; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
29714; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
29715; GFX9-NEXT:    v_mul_f32_e64 v5, |v1|, s4
29716; GFX9-NEXT:    v_floor_f32_e32 v5, v5
29717; GFX9-NEXT:    v_xor_b32_e32 v2, v7, v3
29718; GFX9-NEXT:    v_fma_f32 v7, v5, s5, |v1|
29719; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
29720; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
29721; GFX9-NEXT:    v_xor_b32_e32 v4, v8, v3
29722; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
29723; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29724; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
29725; GFX9-NEXT:    v_xor_b32_e32 v4, v7, v1
29726; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v1
29727; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v1
29728; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
29729; GFX9-NEXT:    v_mov_b32_e32 v1, v6
29730; GFX9-NEXT:    s_setpc_b64 s[30:31]
29731;
29732; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
29733; GFX10:       ; %bb.0:
29734; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29735; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29736; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29737; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
29738; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
29739; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
29740; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
29741; GFX10-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v2|
29742; GFX10-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v0|
29743; GFX10-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v1|
29744; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
29745; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
29746; GFX10-NEXT:    v_floor_f32_e32 v3, v3
29747; GFX10-NEXT:    v_floor_f32_e32 v4, v4
29748; GFX10-NEXT:    v_floor_f32_e32 v6, v6
29749; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
29750; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v3, |v2|
29751; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v4, |v0|
29752; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v6, |v1|
29753; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
29754; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v4
29755; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
29756; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
29757; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
29758; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v5
29759; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
29760; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v5
29761; GFX10-NEXT:    v_xor_b32_e32 v9, v0, v7
29762; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v7
29763; GFX10-NEXT:    v_xor_b32_e32 v10, v1, v8
29764; GFX10-NEXT:    v_xor_b32_e32 v6, v6, v8
29765; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
29766; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
29767; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v9, v7
29768; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
29769; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v10, v8
29770; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
29771; GFX10-NEXT:    s_setpc_b64 s[30:31]
29772;
29773; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64:
29774; GFX11:       ; %bb.0:
29775; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29776; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29777; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29778; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
29779; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29780; GFX11-NEXT:    v_trunc_f32_e32 v2, v2
29781; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
29782; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29783; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
29784; GFX11-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v2|
29785; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29786; GFX11-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v0|
29787; GFX11-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v1|
29788; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
29789; GFX11-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
29790; GFX11-NEXT:    v_floor_f32_e32 v3, v3
29791; GFX11-NEXT:    v_floor_f32_e32 v4, v4
29792; GFX11-NEXT:    v_floor_f32_e32 v6, v6
29793; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
29794; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29795; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v3, |v2|
29796; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v4, |v0|
29797; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
29798; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v6, |v1|
29799; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
29800; GFX11-NEXT:    v_cvt_u32_f32_e32 v4, v4
29801; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
29802; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
29803; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
29804; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v5
29805; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
29806; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v5
29807; GFX11-NEXT:    v_xor_b32_e32 v9, v0, v7
29808; GFX11-NEXT:    v_xor_b32_e32 v4, v4, v7
29809; GFX11-NEXT:    v_xor_b32_e32 v10, v1, v8
29810; GFX11-NEXT:    v_xor_b32_e32 v6, v6, v8
29811; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
29812; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
29813; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v9, v7
29814; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
29815; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v10, v8
29816; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
29817; GFX11-NEXT:    s_setpc_b64 s[30:31]
29818  %op = fptosi <3 x bfloat> %x to <3 x i64>
29819  ret <3 x i64> %op
29820}
29821
; Checks the code llc emits for fptosi from <4 x bfloat> to <4 x i64> under
; every RUN configuration listed at the top of this file. The check lines are
; generated by utils/update_llc_test_checks.py, so regenerate them with that
; script instead of editing them by hand.
;
; In each expected sequence a lane is widened to f32 bits, truncated with
; v_trunc_f32, and its magnitude is split into two 32-bit halves by a multiply
; with 0x2f800000 (2^-32), a floor, and an fma with 0xcf800000 (-2^32). The
; sign mask from v_ashrrev_i32 by 31 is then applied with xor followed by a
; subtract and subtract-with-borrow pair.
29822define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
29823; GCN-LABEL: v_fptosi_v4bf16_to_v4i64:
29824; GCN:       ; %bb.0:
29825; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29826; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
29827; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29828; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29829; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29830; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
29831; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
29832; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29833; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29834; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29835; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
29836; GCN-NEXT:    v_trunc_f32_e32 v0, v0
29837; GCN-NEXT:    v_trunc_f32_e32 v1, v1
29838; GCN-NEXT:    v_trunc_f32_e32 v2, v2
29839; GCN-NEXT:    v_trunc_f32_e32 v3, v3
29840; GCN-NEXT:    v_mul_f32_e64 v4, |v0|, s4
29841; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
29842; GCN-NEXT:    v_mul_f32_e64 v6, |v1|, s4
29843; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
29844; GCN-NEXT:    v_mul_f32_e64 v8, |v2|, s4
29845; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v2
29846; GCN-NEXT:    v_mul_f32_e64 v10, |v3|, s4
29847; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
29848; GCN-NEXT:    v_floor_f32_e32 v4, v4
29849; GCN-NEXT:    v_floor_f32_e32 v6, v6
29850; GCN-NEXT:    v_floor_f32_e32 v8, v8
29851; GCN-NEXT:    v_floor_f32_e32 v10, v10
29852; GCN-NEXT:    v_fma_f32 v0, v4, s5, |v0|
29853; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
29854; GCN-NEXT:    v_fma_f32 v1, v6, s5, |v1|
29855; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v6
29856; GCN-NEXT:    v_fma_f32 v2, v8, s5, |v2|
29857; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
29858; GCN-NEXT:    v_fma_f32 v3, v10, s5, |v3|
29859; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
29860; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
29861; GCN-NEXT:    v_xor_b32_e32 v4, v4, v5
29862; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
29863; GCN-NEXT:    v_xor_b32_e32 v6, v6, v7
29864; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
29865; GCN-NEXT:    v_xor_b32_e32 v8, v8, v9
29866; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
29867; GCN-NEXT:    v_xor_b32_e32 v10, v10, v11
29868; GCN-NEXT:    v_xor_b32_e32 v0, v0, v5
29869; GCN-NEXT:    v_xor_b32_e32 v12, v1, v7
29870; GCN-NEXT:    v_xor_b32_e32 v13, v2, v9
29871; GCN-NEXT:    v_xor_b32_e32 v14, v3, v11
29872; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
29873; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v4, v5, vcc
29874; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v12, v7
29875; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v6, v7, vcc
29876; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v13, v9
29877; GCN-NEXT:    v_subb_u32_e32 v5, vcc, v8, v9, vcc
29878; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v14, v11
29879; GCN-NEXT:    v_subb_u32_e32 v7, vcc, v10, v11, vcc
29880; GCN-NEXT:    s_setpc_b64 s[30:31]
29881;
29882; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64:
29883; GFX7:       ; %bb.0:
29884; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29885; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
29886; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29887; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
29888; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
29889; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v3
29890; GFX7-NEXT:    v_mul_f32_e64 v3, |v0|, s4
29891; GFX7-NEXT:    v_floor_f32_e32 v3, v3
29892; GFX7-NEXT:    s_mov_b32 s5, 0xcf800000
29893; GFX7-NEXT:    v_fma_f32 v5, v3, s5, |v0|
29894; GFX7-NEXT:    v_cvt_u32_f32_e32 v5, v5
29895; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
29896; GFX7-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
29897; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29898; GFX7-NEXT:    v_xor_b32_e32 v0, v5, v6
29899; GFX7-NEXT:    v_trunc_f32_e32 v5, v1
29900; GFX7-NEXT:    v_mul_f32_e64 v1, |v5|, s4
29901; GFX7-NEXT:    v_cvt_u32_f32_e32 v3, v3
29902; GFX7-NEXT:    v_floor_f32_e32 v1, v1
29903; GFX7-NEXT:    v_fma_f32 v7, v1, s5, |v5|
29904; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v7
29905; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
29906; GFX7-NEXT:    v_xor_b32_e32 v3, v3, v6
29907; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
29908; GFX7-NEXT:    v_cvt_u32_f32_e32 v8, v1
29909; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v3, v6, vcc
29910; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
29911; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
29912; GFX7-NEXT:    v_xor_b32_e32 v6, v7, v3
29913; GFX7-NEXT:    v_trunc_f32_e32 v7, v2
29914; GFX7-NEXT:    v_mul_f32_e64 v2, |v7|, s4
29915; GFX7-NEXT:    v_floor_f32_e32 v2, v2
29916; GFX7-NEXT:    v_xor_b32_e32 v5, v8, v3
29917; GFX7-NEXT:    v_fma_f32 v8, v2, s5, |v7|
29918; GFX7-NEXT:    v_cvt_u32_f32_e32 v8, v8
29919; GFX7-NEXT:    v_cvt_u32_f32_e32 v9, v2
29920; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v6, v3
29921; GFX7-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
29922; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v7
29923; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
29924; GFX7-NEXT:    v_xor_b32_e32 v7, v8, v5
29925; GFX7-NEXT:    v_trunc_f32_e32 v8, v4
29926; GFX7-NEXT:    v_mul_f32_e64 v4, |v8|, s4
29927; GFX7-NEXT:    v_floor_f32_e32 v4, v4
29928; GFX7-NEXT:    v_xor_b32_e32 v6, v9, v5
29929; GFX7-NEXT:    v_fma_f32 v9, v4, s5, |v8|
29930; GFX7-NEXT:    v_cvt_u32_f32_e32 v9, v9
29931; GFX7-NEXT:    v_cvt_u32_f32_e32 v10, v4
29932; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v7, v5
29933; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
29934; GFX7-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
29935; GFX7-NEXT:    v_xor_b32_e32 v6, v9, v7
29936; GFX7-NEXT:    v_xor_b32_e32 v8, v10, v7
29937; GFX7-NEXT:    v_sub_i32_e32 v6, vcc, v6, v7
29938; GFX7-NEXT:    v_subb_u32_e32 v7, vcc, v8, v7, vcc
29939; GFX7-NEXT:    s_setpc_b64 s[30:31]
29940;
29941; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64:
29942; GFX8:       ; %bb.0:
29943; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29944; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
29945; GFX8-NEXT:    v_trunc_f32_e32 v2, v2
29946; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
29947; GFX8-NEXT:    v_mul_f32_e64 v3, |v2|, s4
29948; GFX8-NEXT:    v_floor_f32_e32 v3, v3
29949; GFX8-NEXT:    s_mov_b32 s5, 0xcf800000
29950; GFX8-NEXT:    v_fma_f32 v4, v3, s5, |v2|
29951; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
29952; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v4
29953; GFX8-NEXT:    v_trunc_f32_e32 v5, v0
29954; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
29955; GFX8-NEXT:    v_mul_f32_e64 v0, |v5|, s4
29956; GFX8-NEXT:    v_floor_f32_e32 v0, v0
29957; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
29958; GFX8-NEXT:    v_fma_f32 v6, v0, s5, |v5|
29959; GFX8-NEXT:    v_xor_b32_e32 v4, v4, v2
29960; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
29961; GFX8-NEXT:    v_xor_b32_e32 v3, v3, v2
29962; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v0
29963; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v2
29964; GFX8-NEXT:    v_subb_u32_e32 v8, vcc, v3, v2, vcc
29965; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
29966; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
29967; GFX8-NEXT:    v_trunc_f32_e32 v5, v5
29968; GFX8-NEXT:    v_xor_b32_e32 v2, v6, v3
29969; GFX8-NEXT:    v_mul_f32_e64 v6, |v5|, s4
29970; GFX8-NEXT:    v_floor_f32_e32 v6, v6
29971; GFX8-NEXT:    v_xor_b32_e32 v4, v7, v3
29972; GFX8-NEXT:    v_fma_f32 v7, v6, s5, |v5|
29973; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
29974; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
29975; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
29976; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
29977; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
29978; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
29979; GFX8-NEXT:    v_xor_b32_e32 v4, v7, v5
29980; GFX8-NEXT:    v_mul_f32_e64 v7, |v1|, s4
29981; GFX8-NEXT:    v_floor_f32_e32 v7, v7
29982; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
29983; GFX8-NEXT:    v_fma_f32 v9, v7, s5, |v1|
29984; GFX8-NEXT:    v_cvt_u32_f32_e32 v9, v9
29985; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
29986; GFX8-NEXT:    v_xor_b32_e32 v6, v6, v5
29987; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
29988; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
29989; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
29990; GFX8-NEXT:    v_xor_b32_e32 v6, v9, v1
29991; GFX8-NEXT:    v_xor_b32_e32 v7, v7, v1
29992; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v1
29993; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v1, vcc
29994; GFX8-NEXT:    v_mov_b32_e32 v1, v8
29995; GFX8-NEXT:    s_setpc_b64 s[30:31]
29996;
29997; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
29998; GFX9:       ; %bb.0:
29999; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30000; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
30001; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
30002; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
30003; GFX9-NEXT:    v_mul_f32_e64 v3, |v2|, s4
30004; GFX9-NEXT:    v_floor_f32_e32 v3, v3
30005; GFX9-NEXT:    s_mov_b32 s5, 0xcf800000
30006; GFX9-NEXT:    v_fma_f32 v4, v3, s5, |v2|
30007; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30008; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
30009; GFX9-NEXT:    v_trunc_f32_e32 v5, v0
30010; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
30011; GFX9-NEXT:    v_mul_f32_e64 v0, |v5|, s4
30012; GFX9-NEXT:    v_floor_f32_e32 v0, v0
30013; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
30014; GFX9-NEXT:    v_fma_f32 v6, v0, s5, |v5|
30015; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v2
30016; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
30017; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v2
30018; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v0
30019; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v2
30020; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
30021; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
30022; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
30023; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
30024; GFX9-NEXT:    v_xor_b32_e32 v2, v6, v3
30025; GFX9-NEXT:    v_mul_f32_e64 v6, |v5|, s4
30026; GFX9-NEXT:    v_floor_f32_e32 v6, v6
30027; GFX9-NEXT:    v_xor_b32_e32 v4, v7, v3
30028; GFX9-NEXT:    v_fma_f32 v7, v6, s5, |v5|
30029; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
30030; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30031; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
30032; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
30033; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
30034; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
30035; GFX9-NEXT:    v_xor_b32_e32 v4, v7, v5
30036; GFX9-NEXT:    v_mul_f32_e64 v7, |v1|, s4
30037; GFX9-NEXT:    v_floor_f32_e32 v7, v7
30038; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
30039; GFX9-NEXT:    v_fma_f32 v9, v7, s5, |v1|
30040; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
30041; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
30042; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v5
30043; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v5
30044; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
30045; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
30046; GFX9-NEXT:    v_xor_b32_e32 v6, v9, v1
30047; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v1
30048; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v1
30049; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
30050; GFX9-NEXT:    v_mov_b32_e32 v1, v8
30051; GFX9-NEXT:    s_setpc_b64 s[30:31]
30052;
30053; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
30054; GFX10:       ; %bb.0:
30055; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30056; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
30057; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30058; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
30059; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30060; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
30061; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
30062; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
30063; GFX10-NEXT:    v_trunc_f32_e32 v4, v1
30064; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v2|
30065; GFX10-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v0|
30066; GFX10-NEXT:    v_mul_f32_e64 v8, 0x2f800000, |v3|
30067; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
30068; GFX10-NEXT:    v_mul_f32_e64 v9, 0x2f800000, |v4|
30069; GFX10-NEXT:    v_floor_f32_e32 v1, v1
30070; GFX10-NEXT:    v_floor_f32_e32 v6, v6
30071; GFX10-NEXT:    v_floor_f32_e32 v8, v8
30072; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
30073; GFX10-NEXT:    v_floor_f32_e32 v9, v9
30074; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v2|
30075; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v6, |v0|
30076; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
30077; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
30078; GFX10-NEXT:    v_fma_f32 v3, 0xcf800000, v8, |v3|
30079; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
30080; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
30081; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
30082; GFX10-NEXT:    v_fma_f32 v11, 0xcf800000, v9, |v4|
30083; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v5
30084; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v5
30085; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v3
30086; GFX10-NEXT:    v_xor_b32_e32 v3, v0, v7
30087; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v8
30088; GFX10-NEXT:    v_xor_b32_e32 v6, v6, v7
30089; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
30090; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
30091; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
30092; GFX10-NEXT:    v_cvt_u32_f32_e32 v9, v9
30093; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
30094; GFX10-NEXT:    v_xor_b32_e32 v4, v12, v10
30095; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, v7
30096; GFX10-NEXT:    v_xor_b32_e32 v5, v8, v10
30097; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
30098; GFX10-NEXT:    v_xor_b32_e32 v6, v11, v13
30099; GFX10-NEXT:    v_xor_b32_e32 v7, v9, v13
30100; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, v10
30101; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
30102; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, v13
30103; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
30104; GFX10-NEXT:    s_setpc_b64 s[30:31]
30105;
30106; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64:
30107; GFX11:       ; %bb.0:
30108; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30109; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
30110; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30111; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
30112; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30113; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30114; GFX11-NEXT:    v_trunc_f32_e32 v2, v2
30115; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
30116; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30117; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
30118; GFX11-NEXT:    v_trunc_f32_e32 v4, v1
30119; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30120; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v2|
30121; GFX11-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v0|
30122; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
30123; GFX11-NEXT:    v_mul_f32_e64 v8, 0x2f800000, |v3|
30124; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
30125; GFX11-NEXT:    v_mul_f32_e64 v9, 0x2f800000, |v4|
30126; GFX11-NEXT:    v_floor_f32_e32 v1, v1
30127; GFX11-NEXT:    v_floor_f32_e32 v6, v6
30128; GFX11-NEXT:    v_floor_f32_e32 v8, v8
30129; GFX11-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
30130; GFX11-NEXT:    v_floor_f32_e32 v9, v9
30131; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v2|
30132; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v6, |v0|
30133; GFX11-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
30134; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
30135; GFX11-NEXT:    v_fma_f32 v3, 0xcf800000, v8, |v3|
30136; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
30137; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
30138; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
30139; GFX11-NEXT:    v_fma_f32 v11, 0xcf800000, v9, |v4|
30140; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
30141; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v5
30142; GFX11-NEXT:    v_cvt_u32_f32_e32 v12, v3
30143; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v7
30144; GFX11-NEXT:    v_cvt_u32_f32_e32 v8, v8
30145; GFX11-NEXT:    v_xor_b32_e32 v6, v6, v7
30146; GFX11-NEXT:    v_cvt_u32_f32_e32 v11, v11
30147; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
30148; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
30149; GFX11-NEXT:    v_cvt_u32_f32_e32 v9, v9
30150; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
30151; GFX11-NEXT:    v_xor_b32_e32 v4, v12, v10
30152; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, v7
30153; GFX11-NEXT:    v_xor_b32_e32 v5, v8, v10
30154; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
30155; GFX11-NEXT:    v_xor_b32_e32 v6, v11, v13
30156; GFX11-NEXT:    v_xor_b32_e32 v7, v9, v13
30157; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, v10
30158; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
30159; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30160; GFX11-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, v13
30161; GFX11-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
30162; GFX11-NEXT:    s_setpc_b64 s[30:31]
30163  %op = fptosi <4 x bfloat> %x to <4 x i64>
30164  ret <4 x i64> %op
30165}
30166
; Checks the code llc emits for sitofp from a scalar i16 to bfloat under every
; RUN configuration listed at the top of this file. The check lines are
; generated by utils/update_llc_test_checks.py, so regenerate them with that
; script instead of editing them by hand.
;
; The GCN and GFX7 expectations sign-extend with v_bfe_i32, convert to f32 and
; keep only the upper 16 bits through an and with 0xffff0000. The GFX8 and
; newer expectations instead narrow the f32 result with a v_bfe_u32 of bit 16,
; an add of 0x7fff, and a v_cmp_u_f32 select of the value or-ed with 0x400000
; before shifting right by 16 (presumably round-to-nearest-even with NaN
; quieting, confirm against the AMDGPU lowering).
30167define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
30168; GCN-LABEL: v_sitofp_i16_to_bf16:
30169; GCN:       ; %bb.0:
30170; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30171; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
30172; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30173; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30174; GCN-NEXT:    s_setpc_b64 s[30:31]
30175;
30176; GFX7-LABEL: v_sitofp_i16_to_bf16:
30177; GFX7:       ; %bb.0:
30178; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30179; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
30180; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30181; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30182; GFX7-NEXT:    s_setpc_b64 s[30:31]
30183;
30184; GFX8-LABEL: v_sitofp_i16_to_bf16:
30185; GFX8:       ; %bb.0:
30186; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30187; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30188; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
30189; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
30190; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
30191; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v0
30192; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30193; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
30194; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30195; GFX8-NEXT:    s_setpc_b64 s[30:31]
30196;
30197; GFX9-LABEL: v_sitofp_i16_to_bf16:
30198; GFX9:       ; %bb.0:
30199; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30200; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30201; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30202; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
30203; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30204; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
30205; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30206; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
30207; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30208; GFX9-NEXT:    s_setpc_b64 s[30:31]
30209;
30210; GFX10-LABEL: v_sitofp_i16_to_bf16:
30211; GFX10:       ; %bb.0:
30212; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30213; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30214; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
30215; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30216; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30217; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
30218; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30219; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30220; GFX10-NEXT:    s_setpc_b64 s[30:31]
30221;
30222; GFX11-LABEL: v_sitofp_i16_to_bf16:
30223; GFX11:       ; %bb.0:
30224; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30225; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
30226; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
30227; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
30228; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
30229; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30230; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
30232; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
30233; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30234; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
30235; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30236; GFX11-NEXT:    s_setpc_b64 s[30:31]
30237  %op = sitofp i16 %x to bfloat
30238  ret bfloat %op
30239}
30240
; Checks the code llc emits for sitofp from <2 x i16> to <2 x bfloat> under
; every RUN configuration listed at the top of this file. The check lines are
; generated by utils/update_llc_test_checks.py, so regenerate them with that
; script instead of editing them by hand.
;
; In the GCN and GFX7 expectations each lane arrives in its own VGPR and is
; sign-extended, converted to f32 and masked with 0xffff0000. In the GFX8 and
; newer expectations both lanes are read from the two 16-bit halves of v0,
; narrowed with the bfe / add 0x7fff / or 0x400000 / v_cmp_u_f32 select
; sequence, and packed back into v0 with v_alignbit_b32 (GFX8) or v_perm_b32
; using selector 0x7060302 (GFX9 and newer).
30241define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
30242; GCN-LABEL: v_sitofp_v2i16_to_v2bf16:
30243; GCN:       ; %bb.0:
30244; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30245; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
30246; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
30247; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
30248; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30249; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30250; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30251; GCN-NEXT:    s_setpc_b64 s[30:31]
30252;
30253; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16:
30254; GFX7:       ; %bb.0:
30255; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30256; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
30257; GFX7-NEXT:    v_bfe_i32 v1, v1, 0, 16
30258; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30259; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
30260; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30261; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30262; GFX7-NEXT:    s_setpc_b64 s[30:31]
30263;
30264; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
30265; GFX8:       ; %bb.0:
30266; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30267; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30268; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30269; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
30270; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
30271; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
30272; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
30273; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30274; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
30275; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
30276; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
30277; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
30278; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
30279; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30280; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
30281; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30282; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
30283; GFX8-NEXT:    s_setpc_b64 s[30:31]
30284;
30285; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
30286; GFX9:       ; %bb.0:
30287; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30288; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30289; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30290; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30291; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
30292; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
30293; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
30294; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30295; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
30296; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
30297; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
30298; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
30299; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30300; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
30301; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
30302; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
30303; GFX9-NEXT:    s_setpc_b64 s[30:31]
30304;
30305; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
30306; GFX10:       ; %bb.0:
30307; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30308; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30309; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30310; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
30311; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
30312; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
30313; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30314; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
30315; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
30316; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
30317; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
30318; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30319; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
30320; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
30321; GFX10-NEXT:    s_setpc_b64 s[30:31]
30322;
30323; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
30324; GFX11:       ; %bb.0:
30325; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30326; GFX11-NEXT:    v_bfe_i32 v1, v0, 0, 16
30327; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
30328; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30329; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
30330; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
30331; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30332; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
30333; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
30334; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
30335; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30336; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
30337; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
30338; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
30339; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
30340; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
30341; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30342; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
30343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
30344; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
30345; GFX11-NEXT:    s_setpc_b64 s[30:31]
30346  %op = sitofp <2 x i16> %x to <2 x bfloat>
30347  ret <2 x bfloat> %op
30348}
30349
30350define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
30351; GCN-LABEL: v_sitofp_v3i16_to_v3bf16:
30352; GCN:       ; %bb.0:
30353; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30354; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
30355; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
30356; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 16
30357; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
30358; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
30359; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30360; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30361; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30362; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
30363; GCN-NEXT:    s_setpc_b64 s[30:31]
30364;
30365; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16:
30366; GFX7:       ; %bb.0:
30367; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30368; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
30369; GFX7-NEXT:    v_bfe_i32 v1, v1, 0, 16
30370; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
30371; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30372; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
30373; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
30374; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30375; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30376; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
30377; GFX7-NEXT:    s_setpc_b64 s[30:31]
30378;
30379; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
30380; GFX8:       ; %bb.0:
30381; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30382; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30383; GFX8-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30384; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30385; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
30386; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
30387; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
30388; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
30389; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30390; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
30391; GFX8-NEXT:    v_bfe_u32 v3, v4, 16, 1
30392; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
30393; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
30394; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v4
30395; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
30396; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
30397; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
30398; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
30399; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
30400; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
30401; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30402; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
30403; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30404; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
30405; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
30406; GFX8-NEXT:    s_setpc_b64 s[30:31]
30407;
30408; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
30409; GFX9:       ; %bb.0:
30410; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30411; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30412; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30413; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30414; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30415; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
30416; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
30417; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
30418; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30419; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
30420; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
30421; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
30422; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
30423; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
30424; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
30425; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
30426; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
30427; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
30428; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30429; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
30430; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
30431; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
30432; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
30433; GFX9-NEXT:    s_setpc_b64 s[30:31]
30434;
30435; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
30436; GFX10:       ; %bb.0:
30437; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30438; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30439; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30440; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30441; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
30442; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
30443; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v2
30444; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
30445; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
30446; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
30447; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
30448; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
30449; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
30450; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
30451; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30452; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30453; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30454; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30455; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
30456; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30457; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
30458; GFX10-NEXT:    s_setpc_b64 s[30:31]
30459;
30460; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16:
30461; GFX11TRUE16:       ; %bb.0:
30462; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30463; GFX11TRUE16-NEXT:    v_bfe_i32 v2, v0, 0, 16
30464; GFX11TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
30465; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
30466; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30467; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
30468; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
30469; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30470; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
30471; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
30472; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30473; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
30474; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
30475; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
30476; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
30477; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
30478; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
30479; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
30480; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
30481; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
30482; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30483; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30484; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30485; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30486; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30487; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30488; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
30489; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
30490; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
30491; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
30492;
30493; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
30494; GFX11FAKE16:       ; %bb.0:
30495; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30496; GFX11FAKE16-NEXT:    v_bfe_i32 v2, v0, 0, 16
30497; GFX11FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
30498; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
30499; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30500; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
30501; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
30502; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30503; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
30504; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
30505; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30506; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
30507; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
30508; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
30509; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
30510; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
30511; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
30512; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
30513; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
30514; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
30515; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30516; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30517; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30518; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30519; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30520; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30521; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
30522; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
30523; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
30524; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
30525  %op = sitofp <3 x i16> %x to <3 x bfloat>
30526  ret <3 x bfloat> %op
30527}
30528
; Signed integer to bfloat conversion of a <4 x i16> vector (sitofp).
; The assertions inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Code shape recorded by the checks: the GCN and GFX7 runs sign-extend each
; element with v_bfe_i32, convert to f32 and mask the result with 0xffff0000.
; The other runs convert to f32, add bit 16 of the result plus 0x7fff, select
; the 0x400000-or'd value instead when v_cmp_u reports the value as unordered,
; and pack two results per register with v_alignbit_b32 or v_perm_b32.
30529define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
30530; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
30531; GCN:       ; %bb.0:
30532; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30533; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
30534; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
30535; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 16
30536; GCN-NEXT:    v_bfe_i32 v3, v3, 0, 16
30537; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v3
30538; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
30539; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
30540; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30541; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30542; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30543; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
30544; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
30545; GCN-NEXT:    s_setpc_b64 s[30:31]
30546;
30547; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16:
30548; GFX7:       ; %bb.0:
30549; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30550; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
30551; GFX7-NEXT:    v_bfe_i32 v1, v1, 0, 16
30552; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
30553; GFX7-NEXT:    v_bfe_i32 v3, v3, 0, 16
30554; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30555; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
30556; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
30557; GFX7-NEXT:    v_cvt_f32_i32_e32 v3, v3
30558; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30559; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30560; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
30561; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
30562; GFX7-NEXT:    s_setpc_b64 s[30:31]
30563;
30564; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
30565; GFX8:       ; %bb.0:
30566; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30567; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30568; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30569; GFX8-NEXT:    v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30570; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
30571; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
30572; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
30573; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
30574; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v2
30575; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
30576; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
30577; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
30578; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
30579; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
30580; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
30581; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30582; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
30583; GFX8-NEXT:    v_bfe_u32 v4, v5, 16, 1
30584; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30585; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
30586; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
30587; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v5
30588; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
30589; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
30590; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
30591; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
30592; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
30593; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
30594; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30595; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
30596; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
30597; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30598; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
30599; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
30600; GFX8-NEXT:    s_setpc_b64 s[30:31]
30601;
30602; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
30603; GFX9:       ; %bb.0:
30604; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30605; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30606; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30607; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30608; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
30609; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
30610; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
30611; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
30612; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
30613; GFX9-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30614; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30615; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
30616; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
30617; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
30618; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30619; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
30620; GFX9-NEXT:    v_bfe_u32 v3, v4, 16, 1
30621; GFX9-NEXT:    v_add3_u32 v3, v3, v4, s4
30622; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v4
30623; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
30624; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
30625; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
30626; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
30627; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
30628; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30629; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
30630; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
30631; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
30632; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
30633; GFX9-NEXT:    s_setpc_b64 s[30:31]
30634;
30635; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
30636; GFX10:       ; %bb.0:
30637; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30638; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30639; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30640; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30641; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30642; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
30643; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
30644; GFX10-NEXT:    v_bfe_u32 v8, v3, 16, 1
30645; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
30646; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
30647; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
30648; GFX10-NEXT:    v_bfe_u32 v10, v0, 16, 1
30649; GFX10-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
30650; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
30651; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v0
30652; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
30653; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
30654; GFX10-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
30655; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v1
30656; GFX10-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
30657; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
30658; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30659; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
30660; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30661; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
30662; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc_lo
30663; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
30664; GFX10-NEXT:    s_setpc_b64 s[30:31]
30665;
30666; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
30667; GFX11:       ; %bb.0:
30668; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30669; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
30670; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
30671; GFX11-NEXT:    v_bfe_i32 v3, v0, 0, 16
30672; GFX11-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
30673; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30674; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
30675; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
30676; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30677; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
30678; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
30679; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
30680; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
30681; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
30682; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
30683; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
30684; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
30685; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
30686; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v1
30687; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
30688; GFX11-NEXT:    v_bfe_u32 v10, v0, 16, 1
30689; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
30690; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
30691; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
30692; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
30693; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v0
30694; GFX11-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
30695; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
30696; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30697; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
30698; GFX11-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
30699; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30700; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
30701; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc_lo
30702; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
30703; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
30704; GFX11-NEXT:    s_setpc_b64 s[30:31]
30705  %op = sitofp <4 x i16> %x to <4 x bfloat>
30706  ret <4 x bfloat> %op
30707}
30708
; Signed integer to bfloat conversion of a scalar i32 (sitofp).
; The assertions inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Code shape recorded by the checks: the GCN and GFX7 runs convert to f32 and
; mask the result with 0xffff0000. The other runs convert to f32, add bit 16 of
; the result plus 0x7fff, select the 0x400000-or'd value instead when v_cmp_u
; reports the value as unordered, and shift the selected value right by 16.
30709define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
30710; GCN-LABEL: v_sitofp_i32_to_bf16:
30711; GCN:       ; %bb.0:
30712; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30713; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30714; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30715; GCN-NEXT:    s_setpc_b64 s[30:31]
30716;
30717; GFX7-LABEL: v_sitofp_i32_to_bf16:
30718; GFX7:       ; %bb.0:
30719; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30720; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30721; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30722; GFX7-NEXT:    s_setpc_b64 s[30:31]
30723;
30724; GFX8-LABEL: v_sitofp_i32_to_bf16:
30725; GFX8:       ; %bb.0:
30726; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30727; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
30728; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
30729; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
30730; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
30731; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v0
30732; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30733; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
30734; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30735; GFX8-NEXT:    s_setpc_b64 s[30:31]
30736;
30737; GFX9-LABEL: v_sitofp_i32_to_bf16:
30738; GFX9:       ; %bb.0:
30739; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30740; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
30741; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30742; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
30743; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30744; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
30745; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30746; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
30747; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30748; GFX9-NEXT:    s_setpc_b64 s[30:31]
30749;
30750; GFX10-LABEL: v_sitofp_i32_to_bf16:
30751; GFX10:       ; %bb.0:
30752; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30753; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
30754; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
30755; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30756; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30757; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
30758; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30759; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30760; GFX10-NEXT:    s_setpc_b64 s[30:31]
30761;
30762; GFX11-LABEL: v_sitofp_i32_to_bf16:
30763; GFX11:       ; %bb.0:
30764; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30765; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
30766; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30767; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
30768; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30769; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30770; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
30771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
30772; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30773; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
30774; GFX11-NEXT:    s_setpc_b64 s[30:31]
30775  %op = sitofp i32 %x to bfloat
30776  ret bfloat %op
30777}
30778
; Signed integer to bfloat conversion of a <2 x i32> vector (sitofp).
; The assertions inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Code shape recorded by the checks: the GCN and GFX7 runs convert each element
; to f32 and mask it with 0xffff0000. The other runs convert to f32, add bit 16
; of the result plus 0x7fff, select the 0x400000-or'd value instead when
; v_cmp_u reports the value as unordered, and pack both results into v0 with
; v_alignbit_b32 or v_perm_b32.
30779define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
30780; GCN-LABEL: v_sitofp_v2i32_to_v2bf16:
30781; GCN:       ; %bb.0:
30782; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30783; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
30784; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30785; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30786; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30787; GCN-NEXT:    s_setpc_b64 s[30:31]
30788;
30789; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16:
30790; GFX7:       ; %bb.0:
30791; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30792; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30793; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
30794; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30795; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30796; GFX7-NEXT:    s_setpc_b64 s[30:31]
30797;
30798; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
30799; GFX8:       ; %bb.0:
30800; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30801; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
30802; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
30803; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
30804; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
30805; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
30806; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
30807; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30808; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
30809; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
30810; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
30811; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
30812; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
30813; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30814; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
30815; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
30816; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
30817; GFX8-NEXT:    s_setpc_b64 s[30:31]
30818;
30819; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
30820; GFX9:       ; %bb.0:
30821; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30822; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
30823; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
30824; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30825; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
30826; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
30827; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
30828; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30829; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
30830; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
30831; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
30832; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
30833; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30834; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
30835; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
30836; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
30837; GFX9-NEXT:    s_setpc_b64 s[30:31]
30838;
30839; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
30840; GFX10:       ; %bb.0:
30841; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30842; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
30843; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
30844; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
30845; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
30846; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
30847; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30848; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
30849; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
30850; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
30851; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
30852; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30853; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
30854; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
30855; GFX10-NEXT:    s_setpc_b64 s[30:31]
30856;
30857; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
30858; GFX11:       ; %bb.0:
30859; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30860; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
30861; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
30862; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30863; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
30864; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
30865; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
30866; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30867; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
30868; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
30869; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
30870; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
30871; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
30872; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30873; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
30874; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
30875; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
30876; GFX11-NEXT:    s_setpc_b64 s[30:31]
30877  %op = sitofp <2 x i32> %x to <2 x bfloat>
30878  ret <2 x bfloat> %op
30879}
30880
; Signed integer to bfloat conversion of a <3 x i32> vector (sitofp).
; The assertions inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Code shape recorded by the checks: the GCN and GFX7 runs convert each element
; to f32 and mask it with 0xffff0000. The other runs convert to f32, add bit 16
; of the result plus 0x7fff, and select the 0x400000-or'd value instead when
; v_cmp_u reports the value as unordered; elements 0 and 1 are packed into v0
; and element 2 ends up in v1.
; The two gfx1100 runs are checked under separate TRUE16 and FAKE16 prefixes
; here; in the recorded output they differ only in the first source operand of
; the final v_alignbit_b32 (v0 versus s0).
30881define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
30882; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
30883; GCN:       ; %bb.0:
30884; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30885; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
30886; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
30887; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
30888; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30889; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30890; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
30891; GCN-NEXT:    s_setpc_b64 s[30:31]
30892;
30893; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16:
30894; GFX7:       ; %bb.0:
30895; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30896; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
30897; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
30898; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
30899; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
30900; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
30901; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
30902; GFX7-NEXT:    s_setpc_b64 s[30:31]
30903;
30904; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
30905; GFX8:       ; %bb.0:
30906; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30907; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
30908; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
30909; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
30910; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
30911; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
30912; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
30913; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v2
30914; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
30915; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
30916; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
30917; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
30918; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
30919; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
30920; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30921; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
30922; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
30923; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
30924; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
30925; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
30926; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30927; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
30928; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
30929; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
30930; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
30931; GFX8-NEXT:    v_mov_b32_e32 v1, v2
30932; GFX8-NEXT:    s_setpc_b64 s[30:31]
30933;
30934; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
30935; GFX9:       ; %bb.0:
30936; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30937; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
30938; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
30939; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
30940; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
30941; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
30942; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
30943; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
30944; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
30945; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
30946; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
30947; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
30948; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
30949; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
30950; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
30951; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
30952; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
30953; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
30954; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
30955; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
30956; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
30957; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
30958; GFX9-NEXT:    v_alignbit_b32 v1, s4, v2, 16
30959; GFX9-NEXT:    s_setpc_b64 s[30:31]
30960;
30961; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
30962; GFX10:       ; %bb.0:
30963; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30964; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
30965; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
30966; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
30967; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
30968; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
30969; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
30970; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30971; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
30972; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
30973; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
30974; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
30975; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
30976; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
30977; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
30978; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
30979; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
30980; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
30981; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
30982; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
30983; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
30984; GFX10-NEXT:    s_setpc_b64 s[30:31]
30985;
30986; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16:
30987; GFX11TRUE16:       ; %bb.0:
30988; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30989; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
30990; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
30991; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
30992; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30993; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
30994; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
30995; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
30996; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
30997; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
30998; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
30999; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
31000; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
31001; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
31002; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
31003; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31004; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31005; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
31006; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31007; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
31008; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
31009; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
31010; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
31011; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
31012;
31013; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
31014; GFX11FAKE16:       ; %bb.0:
31015; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31016; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
31017; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
31018; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
31019; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31020; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
31021; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
31022; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
31023; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31024; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
31025; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
31026; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
31027; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
31028; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
31029; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
31030; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31031; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31032; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
31033; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31034; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
31035; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
31036; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
31037; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v2, 16
31038; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
31039  %op = sitofp <3 x i32> %x to <3 x bfloat>
31040  ret <3 x bfloat> %op
31041}
31042
31043define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
31044; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
31045; GCN:       ; %bb.0:
31046; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31047; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v3
31048; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
31049; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
31050; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
31051; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31052; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
31053; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
31054; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
31055; GCN-NEXT:    s_setpc_b64 s[30:31]
31056;
31057; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16:
31058; GFX7:       ; %bb.0:
31059; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31060; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
31061; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
31062; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
31063; GFX7-NEXT:    v_cvt_f32_i32_e32 v3, v3
31064; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31065; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
31066; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
31067; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
31068; GFX7-NEXT:    s_setpc_b64 s[30:31]
31069;
31070; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
31071; GFX8:       ; %bb.0:
31072; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31073; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
31074; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
31075; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
31076; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
31077; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
31078; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
31079; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
31080; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
31081; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
31082; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
31083; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
31084; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
31085; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
31086; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v3
31087; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
31088; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
31089; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
31090; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
31091; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
31092; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
31093; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
31094; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31095; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
31096; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
31097; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
31098; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
31099; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
31100; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
31101; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
31102; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
31103; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
31104; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
31105; GFX8-NEXT:    v_alignbit_b32 v1, v3, v2, 16
31106; GFX8-NEXT:    s_setpc_b64 s[30:31]
31107;
31108; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
31109; GFX9:       ; %bb.0:
31110; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31111; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
31112; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
31113; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
31114; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
31115; GFX9-NEXT:    v_bfe_u32 v4, v2, 16, 1
31116; GFX9-NEXT:    v_add3_u32 v4, v4, v2, s4
31117; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v2
31118; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
31119; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
31120; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
31121; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
31122; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
31123; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
31124; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
31125; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
31126; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
31127; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
31128; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
31129; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31130; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
31131; GFX9-NEXT:    v_bfe_u32 v4, v1, 16, 1
31132; GFX9-NEXT:    v_add3_u32 v4, v4, v1, s4
31133; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
31134; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
31135; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
31136; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
31137; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
31138; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
31139; GFX9-NEXT:    s_setpc_b64 s[30:31]
31140;
31141; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
31142; GFX10:       ; %bb.0:
31143; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31144; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
31145; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
31146; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
31147; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
31148; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
31149; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
31150; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
31151; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31152; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
31153; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
31154; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
31155; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
31156; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
31157; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v1
31158; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
31159; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31160; GFX10-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
31161; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
31162; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v3
31163; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
31164; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31165; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
31166; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
31167; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
31168; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
31169; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
31170; GFX10-NEXT:    s_setpc_b64 s[30:31]
31171;
31172; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
31173; GFX11:       ; %bb.0:
31174; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31175; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
31176; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
31177; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
31178; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
31179; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
31180; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
31181; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
31182; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
31183; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
31184; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31185; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
31186; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
31187; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
31188; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
31189; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
31190; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
31191; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31192; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
31193; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
31194; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
31195; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31196; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
31197; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
31198; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
31199; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
31200; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
31201; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
31202; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
31203; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
31204; GFX11-NEXT:    s_setpc_b64 s[30:31]
31205  %op = sitofp <4 x i32> %x to <4 x bfloat>
31206  ret <4 x bfloat> %op
31207}
31208
; Test: scalar signed i64 -> bfloat conversion (sitofp); the result is returned directly.
;
; The assertion lines inside the function are autogenerated (see the NOTE on the
; first line of this file). Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected output shows, per run line:
;  - All targets derive a shift amount from v_ffbh_i32 (v_cls_i32 on gfx1100),
;    shift the 64-bit input left, fold the low half into a single sticky bit
;    (v_min_u32 with 1, then v_or_b32), convert with v_cvt_f32_i32 and rescale
;    with v_ldexp_f32.
;  - The default and hawaii runs finish by masking v0 with 0xffff0000.
;  - The tonga and newer runs instead emit v_bfe_u32 / add 0x7fff /
;    v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 and then shift right by 16
;    (presumably round-to-nearest-even with NaN quieting -- confirm against the
;    bf16 lowering code).
31209define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
31210; GCN-LABEL: v_sitofp_i64_to_bf16:
31211; GCN:       ; %bb.0:
31212; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31213; GCN-NEXT:    v_xor_b32_e32 v2, v0, v1
31214; GCN-NEXT:    v_ffbh_i32_e32 v3, v1
31215; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
31216; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
31217; GCN-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
31218; GCN-NEXT:    v_min_u32_e32 v2, v3, v2
31219; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
31220; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
31221; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
31222; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
31223; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
31224; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
31225; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31226; GCN-NEXT:    s_setpc_b64 s[30:31]
31227;
31228; GFX7-LABEL: v_sitofp_i64_to_bf16:
31229; GFX7:       ; %bb.0:
31230; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31231; GFX7-NEXT:    v_xor_b32_e32 v2, v0, v1
31232; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
31233; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
31234; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
31235; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
31236; GFX7-NEXT:    v_min_u32_e32 v2, v3, v2
31237; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
31238; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
31239; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
31240; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
31241; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
31242; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
31243; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31244; GFX7-NEXT:    s_setpc_b64 s[30:31]
31245;
31246; GFX8-LABEL: v_sitofp_i64_to_bf16:
31247; GFX8:       ; %bb.0:
31248; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31249; GFX8-NEXT:    v_xor_b32_e32 v2, v0, v1
31250; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
31251; GFX8-NEXT:    v_ffbh_i32_e32 v3, v1
31252; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
31253; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
31254; GFX8-NEXT:    v_min_u32_e32 v2, v3, v2
31255; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
31256; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
31257; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
31258; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
31259; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
31260; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
31261; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
31262; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
31263; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
31264; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
31265; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31266; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
31267; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
31268; GFX8-NEXT:    s_setpc_b64 s[30:31]
31269;
31270; GFX9-LABEL: v_sitofp_i64_to_bf16:
31271; GFX9:       ; %bb.0:
31272; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31273; GFX9-NEXT:    v_xor_b32_e32 v2, v0, v1
31274; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
31275; GFX9-NEXT:    v_ffbh_i32_e32 v3, v1
31276; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
31277; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
31278; GFX9-NEXT:    v_min_u32_e32 v2, v3, v2
31279; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
31280; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
31281; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
31282; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
31283; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
31284; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
31285; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
31286; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
31287; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
31288; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
31289; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31290; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
31291; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
31292; GFX9-NEXT:    s_setpc_b64 s[30:31]
31293;
31294; GFX10-LABEL: v_sitofp_i64_to_bf16:
31295; GFX10:       ; %bb.0:
31296; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31297; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
31298; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
31299; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
31300; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
31301; GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
31302; GFX10-NEXT:    v_min_u32_e32 v2, v3, v2
31303; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
31304; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
31305; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
31306; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
31307; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
31308; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
31309; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
31310; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
31311; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31312; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
31313; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
31314; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
31315; GFX10-NEXT:    s_setpc_b64 s[30:31]
31316;
31317; GFX11-LABEL: v_sitofp_i64_to_bf16:
31318; GFX11:       ; %bb.0:
31319; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31320; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
31321; GFX11-NEXT:    v_cls_i32_e32 v3, v1
31322; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31323; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
31324; GFX11-NEXT:    v_add_nc_u32_e32 v3, -1, v3
31325; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
31326; GFX11-NEXT:    v_add_nc_u32_e32 v2, 32, v2
31327; GFX11-NEXT:    v_min_u32_e32 v2, v3, v2
31328; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
31329; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
31330; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
31331; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31332; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
31333; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
31334; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
31335; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
31336; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
31337; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
31338; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
31339; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31340; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
31341; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
31342; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
31343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
31344; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
31345; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; The only IR under test: one signed-integer-to-bfloat conversion.
31346  %op = sitofp i64 %x to bfloat
31347  ret bfloat %op
31348}
31349
; Test: two-element vector signed i64 -> bfloat conversion (sitofp on <2 x i64>).
;
; The assertion lines inside the function are autogenerated (see the NOTE on the
; first line of this file). Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected output shows, per run line:
;  - Each element goes through the same shift / v_cvt_f32_i32 / v_ldexp_f32
;    sequence as the scalar i64 case; the two element pipelines are interleaved
;    differently depending on the target.
;  - The default and hawaii runs leave the two results in v0 and v1, each
;    masked with 0xffff0000.
;  - The tonga run combines both results into v0 with v_alignbit_b32 (shift 16).
;  - The gfx900 and newer runs combine both results into v0 with v_perm_b32
;    using selector 0x7060302.
31350define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
31351; GCN-LABEL: v_sitofp_v2i64_to_v2bf16:
31352; GCN:       ; %bb.0:
31353; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31354; GCN-NEXT:    v_ffbh_i32_e32 v4, v3
31355; GCN-NEXT:    v_xor_b32_e32 v5, v2, v3
31356; GCN-NEXT:    v_ffbh_i32_e32 v6, v1
31357; GCN-NEXT:    v_xor_b32_e32 v7, v0, v1
31358; GCN-NEXT:    v_add_i32_e32 v4, vcc, -1, v4
31359; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31360; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
31361; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31362; GCN-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
31363; GCN-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
31364; GCN-NEXT:    v_min_u32_e32 v4, v4, v5
31365; GCN-NEXT:    v_min_u32_e32 v5, v6, v7
31366; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
31367; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
31368; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v5
31369; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 32, v5
31370; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
31371; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
31372; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
31373; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
31374; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v2
31375; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
31376; GCN-NEXT:    v_ldexp_f32_e32 v1, v1, v4
31377; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v5
31378; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31379; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
31380; GCN-NEXT:    s_setpc_b64 s[30:31]
31381;
31382; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16:
31383; GFX7:       ; %bb.0:
31384; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31385; GFX7-NEXT:    v_xor_b32_e32 v5, v2, v3
31386; GFX7-NEXT:    v_ffbh_i32_e32 v4, v3
31387; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31388; GFX7-NEXT:    v_add_i32_e32 v4, vcc, -1, v4
31389; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
31390; GFX7-NEXT:    v_min_u32_e32 v4, v4, v5
31391; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
31392; GFX7-NEXT:    v_xor_b32_e32 v5, v0, v1
31393; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
31394; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
31395; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
31396; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31397; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
31398; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
31399; GFX7-NEXT:    v_min_u32_e32 v3, v3, v5
31400; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
31401; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
31402; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
31403; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
31404; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
31405; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
31406; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v4
31407; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
31408; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
31409; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31410; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
31411; GFX7-NEXT:    s_setpc_b64 s[30:31]
31412;
31413; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
31414; GFX8:       ; %bb.0:
31415; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31416; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
31417; GFX8-NEXT:    v_ffbh_i32_e32 v4, v1
31418; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31419; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
31420; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
31421; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
31422; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
31423; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
31424; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
31425; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
31426; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
31427; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v4
31428; GFX8-NEXT:    v_ldexp_f32 v4, v0, v1
31429; GFX8-NEXT:    v_bfe_u32 v0, v4, 16, 1
31430; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
31431; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
31432; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v0
31433; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
31434; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
31435; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
31436; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
31437; GFX8-NEXT:    v_min_u32_e32 v6, v0, v1
31438; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
31439; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v4
31440; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
31441; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
31442; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
31443; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
31444; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
31445; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
31446; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
31447; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
31448; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
31449; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
31450; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
31451; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31452; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
31453; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
31454; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
31455; GFX8-NEXT:    s_setpc_b64 s[30:31]
31456;
31457; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
31458; GFX9:       ; %bb.0:
31459; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31460; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
31461; GFX9-NEXT:    v_ffbh_i32_e32 v4, v1
31462; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31463; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
31464; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
31465; GFX9-NEXT:    v_min_u32_e32 v4, v4, v5
31466; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
31467; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
31468; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
31469; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
31470; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
31471; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
31472; GFX9-NEXT:    v_ldexp_f32 v4, v0, v1
31473; GFX9-NEXT:    v_bfe_u32 v0, v4, 16, 1
31474; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
31475; GFX9-NEXT:    v_add3_u32 v5, v0, v4, s4
31476; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
31477; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
31478; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
31479; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
31480; GFX9-NEXT:    v_min_u32_e32 v6, v0, v1
31481; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
31482; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v4
31483; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
31484; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
31485; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
31486; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
31487; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
31488; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
31489; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
31490; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
31491; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
31492; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
31493; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31494; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
31495; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
31496; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
31497; GFX9-NEXT:    s_setpc_b64 s[30:31]
31498;
31499; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
31500; GFX10:       ; %bb.0:
31501; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31502; GFX10-NEXT:    v_xor_b32_e32 v4, v0, v1
31503; GFX10-NEXT:    v_xor_b32_e32 v5, v2, v3
31504; GFX10-NEXT:    v_ffbh_i32_e32 v6, v1
31505; GFX10-NEXT:    v_ffbh_i32_e32 v7, v3
31506; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
31507; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31508; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
31509; GFX10-NEXT:    v_add_nc_u32_e32 v7, -1, v7
31510; GFX10-NEXT:    v_add_nc_u32_e32 v4, 32, v4
31511; GFX10-NEXT:    v_add_nc_u32_e32 v5, 32, v5
31512; GFX10-NEXT:    v_min_u32_e32 v4, v6, v4
31513; GFX10-NEXT:    v_min_u32_e32 v5, v7, v5
31514; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
31515; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
31516; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
31517; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
31518; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
31519; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
31520; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
31521; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
31522; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
31523; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
31524; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
31525; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
31526; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
31527; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
31528; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
31529; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31530; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
31531; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
31532; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
31533; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
31534; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31535; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
31536; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
31537; GFX10-NEXT:    s_setpc_b64 s[30:31]
31538;
31539; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
31540; GFX11:       ; %bb.0:
31541; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31542; GFX11-NEXT:    v_xor_b32_e32 v4, v0, v1
31543; GFX11-NEXT:    v_xor_b32_e32 v5, v2, v3
31544; GFX11-NEXT:    v_cls_i32_e32 v6, v1
31545; GFX11-NEXT:    v_cls_i32_e32 v7, v3
31546; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31547; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
31548; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31549; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31550; GFX11-NEXT:    v_add_nc_u32_e32 v6, -1, v6
31551; GFX11-NEXT:    v_add_nc_u32_e32 v7, -1, v7
31552; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31553; GFX11-NEXT:    v_add_nc_u32_e32 v4, 32, v4
31554; GFX11-NEXT:    v_add_nc_u32_e32 v5, 32, v5
31555; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31556; GFX11-NEXT:    v_min_u32_e32 v4, v6, v4
31557; GFX11-NEXT:    v_min_u32_e32 v5, v7, v5
31558; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31559; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
31560; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
31561; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31562; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
31563; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
31564; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31565; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
31566; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
31567; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
31568; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
31569; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31570; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
31571; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
31572; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31573; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
31574; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
31575; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31576; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
31577; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
31578; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
31579; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31580; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
31581; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
31582; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
31583; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
31584; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
31585; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31586; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
31587; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
31588; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
31589; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; The only IR under test: one vector signed-integer-to-bfloat conversion.
31590  %op = sitofp <2 x i64> %x to <2 x bfloat>
31591  ret <2 x bfloat> %op
31592}
31593
31594define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
31595; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
31596; GCN:       ; %bb.0:
31597; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31598; GCN-NEXT:    v_ffbh_i32_e32 v6, v5
31599; GCN-NEXT:    v_xor_b32_e32 v7, v4, v5
31600; GCN-NEXT:    v_ffbh_i32_e32 v8, v3
31601; GCN-NEXT:    v_xor_b32_e32 v9, v2, v3
31602; GCN-NEXT:    v_ffbh_i32_e32 v10, v1
31603; GCN-NEXT:    v_xor_b32_e32 v11, v0, v1
31604; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
31605; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31606; GCN-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
31607; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
31608; GCN-NEXT:    v_add_i32_e32 v10, vcc, -1, v10
31609; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
31610; GCN-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
31611; GCN-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
31612; GCN-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
31613; GCN-NEXT:    v_min_u32_e32 v6, v6, v7
31614; GCN-NEXT:    v_min_u32_e32 v7, v8, v9
31615; GCN-NEXT:    v_min_u32_e32 v8, v10, v11
31616; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
31617; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 32, v6
31618; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
31619; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 32, v7
31620; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
31621; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
31622; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
31623; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
31624; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
31625; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
31626; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
31627; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
31628; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v4
31629; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
31630; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
31631; GCN-NEXT:    v_ldexp_f32_e32 v3, v1, v6
31632; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v7
31633; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v8
31634; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31635; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
31636; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
31637; GCN-NEXT:    s_setpc_b64 s[30:31]
31638;
31639; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16:
31640; GFX7:       ; %bb.0:
31641; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31642; GFX7-NEXT:    v_xor_b32_e32 v7, v4, v5
31643; GFX7-NEXT:    v_ffbh_i32_e32 v6, v5
31644; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31645; GFX7-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
31646; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
31647; GFX7-NEXT:    v_min_u32_e32 v6, v6, v7
31648; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
31649; GFX7-NEXT:    v_xor_b32_e32 v7, v2, v3
31650; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
31651; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
31652; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
31653; GFX7-NEXT:    v_ffbh_i32_e32 v6, v3
31654; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31655; GFX7-NEXT:    v_cvt_f32_i32_e32 v4, v4
31656; GFX7-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
31657; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
31658; GFX7-NEXT:    v_min_u32_e32 v6, v6, v7
31659; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
31660; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
31661; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
31662; GFX7-NEXT:    v_xor_b32_e32 v5, v0, v1
31663; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
31664; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
31665; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
31666; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
31667; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
31668; GFX7-NEXT:    v_min_u32_e32 v3, v3, v5
31669; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
31670; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
31671; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
31672; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
31673; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
31674; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
31675; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
31676; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
31677; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
31678; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
31679; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
31680; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
31681; GFX7-NEXT:    s_setpc_b64 s[30:31]
31682;
31683; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
31684; GFX8:       ; %bb.0:
31685; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31686; GFX8-NEXT:    v_xor_b32_e32 v7, v4, v5
31687; GFX8-NEXT:    v_ffbh_i32_e32 v6, v5
31688; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31689; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
31690; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
31691; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
31692; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
31693; GFX8-NEXT:    v_xor_b32_e32 v8, v0, v1
31694; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
31695; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
31696; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
31697; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
31698; GFX8-NEXT:    v_ffbh_i32_e32 v7, v1
31699; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
31700; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
31701; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
31702; GFX8-NEXT:    v_add_u32_e32 v7, vcc, -1, v7
31703; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v8
31704; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
31705; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
31706; GFX8-NEXT:    v_min_u32_e32 v7, v7, v8
31707; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
31708; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
31709; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
31710; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
31711; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
31712; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
31713; GFX8-NEXT:    v_xor_b32_e32 v6, v2, v3
31714; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
31715; GFX8-NEXT:    v_ffbh_i32_e32 v5, v3
31716; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
31717; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
31718; GFX8-NEXT:    v_add_u32_e32 v5, vcc, -1, v5
31719; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 32, v6
31720; GFX8-NEXT:    v_min_u32_e32 v5, v5, v6
31721; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
31722; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
31723; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v7
31724; GFX8-NEXT:    v_ldexp_f32 v0, v0, v4
31725; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
31726; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
31727; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
31728; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
31729; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
31730; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
31731; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v0
31732; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31733; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
31734; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
31735; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
31736; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
31737; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
31738; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
31739; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
31740; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
31741; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
31742; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
31743; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
31744; GFX8-NEXT:    s_setpc_b64 s[30:31]
31745;
31746; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
31747; GFX9:       ; %bb.0:
31748; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31749; GFX9-NEXT:    v_xor_b32_e32 v7, v4, v5
31750; GFX9-NEXT:    v_ffbh_i32_e32 v6, v5
31751; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31752; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
31753; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
31754; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
31755; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
31756; GFX9-NEXT:    v_xor_b32_e32 v7, v0, v1
31757; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
31758; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
31759; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
31760; GFX9-NEXT:    v_ffbh_i32_e32 v6, v1
31761; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31762; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
31763; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
31764; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
31765; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
31766; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
31767; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
31768; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
31769; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
31770; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
31771; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
31772; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
31773; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
31774; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v4
31775; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
31776; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
31777; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
31778; GFX9-NEXT:    v_ldexp_f32 v5, v0, v1
31779; GFX9-NEXT:    v_bfe_u32 v0, v5, 16, 1
31780; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
31781; GFX9-NEXT:    v_add3_u32 v6, v0, v5, s4
31782; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
31783; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
31784; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
31785; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
31786; GFX9-NEXT:    v_min_u32_e32 v7, v0, v1
31787; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
31788; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v5
31789; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
31790; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
31791; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
31792; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
31793; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
31794; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
31795; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
31796; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
31797; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
31798; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
31799; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
31800; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
31801; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
31802; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
31803; GFX9-NEXT:    v_alignbit_b32 v1, s4, v4, 16
31804; GFX9-NEXT:    s_setpc_b64 s[30:31]
31805;
31806; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
31807; GFX10:       ; %bb.0:
31808; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31809; GFX10-NEXT:    v_xor_b32_e32 v8, v0, v1
31810; GFX10-NEXT:    v_xor_b32_e32 v7, v4, v5
31811; GFX10-NEXT:    v_xor_b32_e32 v9, v2, v3
31812; GFX10-NEXT:    v_ffbh_i32_e32 v10, v1
31813; GFX10-NEXT:    v_ffbh_i32_e32 v6, v5
31814; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
31815; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31816; GFX10-NEXT:    v_ffbh_i32_e32 v11, v3
31817; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
31818; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
31819; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
31820; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
31821; GFX10-NEXT:    v_add_nc_u32_e32 v7, 32, v7
31822; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
31823; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
31824; GFX10-NEXT:    v_min_u32_e32 v8, v10, v8
31825; GFX10-NEXT:    v_min_u32_e32 v6, v6, v7
31826; GFX10-NEXT:    v_min_u32_e32 v7, v11, v9
31827; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
31828; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
31829; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
31830; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
31831; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
31832; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
31833; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
31834; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
31835; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
31836; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
31837; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
31838; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
31839; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
31840; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
31841; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
31842; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
31843; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
31844; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
31845; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
31846; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
31847; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31848; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
31849; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
31850; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
31851; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
31852; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
31853; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
31854; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
31855; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31856; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31857; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31858; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31859; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
31860; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31861; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
31862; GFX10-NEXT:    s_setpc_b64 s[30:31]
31863;
31864; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16:
31865; GFX11TRUE16:       ; %bb.0:
31866; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31867; GFX11TRUE16-NEXT:    v_xor_b32_e32 v8, v0, v1
31868; GFX11TRUE16-NEXT:    v_xor_b32_e32 v7, v4, v5
31869; GFX11TRUE16-NEXT:    v_xor_b32_e32 v9, v2, v3
31870; GFX11TRUE16-NEXT:    v_cls_i32_e32 v10, v1
31871; GFX11TRUE16-NEXT:    v_cls_i32_e32 v6, v5
31872; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
31873; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31874; GFX11TRUE16-NEXT:    v_cls_i32_e32 v11, v3
31875; GFX11TRUE16-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
31876; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v10, -1, v10
31877; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
31878; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v6, -1, v6
31879; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v7, 32, v7
31880; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v11, -1, v11
31881; GFX11TRUE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
31882; GFX11TRUE16-NEXT:    v_min_u32_e32 v8, v10, v8
31883; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31884; GFX11TRUE16-NEXT:    v_min_u32_e32 v6, v6, v7
31885; GFX11TRUE16-NEXT:    v_min_u32_e32 v7, v11, v9
31886; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31887; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
31888; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
31889; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
31890; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31891; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
31892; GFX11TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
31893; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31894; GFX11TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
31895; GFX11TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
31896; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31897; GFX11TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
31898; GFX11TRUE16-NEXT:    v_or_b32_e32 v1, v5, v4
31899; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
31900; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
31901; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
31902; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
31903; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
31904; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
31905; GFX11TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
31906; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31907; GFX11TRUE16-NEXT:    v_ldexp_f32 v0, v0, v3
31908; GFX11TRUE16-NEXT:    v_ldexp_f32 v1, v1, v6
31909; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31910; GFX11TRUE16-NEXT:    v_ldexp_f32 v2, v2, v4
31911; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
31912; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
31913; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31914; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
31915; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
31916; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
31917; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
31918; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
31919; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
31920; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
31921; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
31922; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31923; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31924; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31925; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31926; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31927; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
31928; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31929; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
31930; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
31931; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
31932;
31933; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
31934; GFX11FAKE16:       ; %bb.0:
31935; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31936; GFX11FAKE16-NEXT:    v_xor_b32_e32 v8, v0, v1
31937; GFX11FAKE16-NEXT:    v_xor_b32_e32 v7, v4, v5
31938; GFX11FAKE16-NEXT:    v_xor_b32_e32 v9, v2, v3
31939; GFX11FAKE16-NEXT:    v_cls_i32_e32 v10, v1
31940; GFX11FAKE16-NEXT:    v_cls_i32_e32 v6, v5
31941; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
31942; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
31943; GFX11FAKE16-NEXT:    v_cls_i32_e32 v11, v3
31944; GFX11FAKE16-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
31945; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v10, -1, v10
31946; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
31947; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v6, -1, v6
31948; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v7, 32, v7
31949; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v11, -1, v11
31950; GFX11FAKE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
31951; GFX11FAKE16-NEXT:    v_min_u32_e32 v8, v10, v8
31952; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31953; GFX11FAKE16-NEXT:    v_min_u32_e32 v6, v6, v7
31954; GFX11FAKE16-NEXT:    v_min_u32_e32 v7, v11, v9
31955; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31956; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
31957; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
31958; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
31959; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31960; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
31961; GFX11FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
31962; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31963; GFX11FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
31964; GFX11FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
31965; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31966; GFX11FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
31967; GFX11FAKE16-NEXT:    v_or_b32_e32 v1, v5, v4
31968; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
31969; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
31970; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
31971; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
31972; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
31973; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
31974; GFX11FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
31975; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31976; GFX11FAKE16-NEXT:    v_ldexp_f32 v0, v0, v3
31977; GFX11FAKE16-NEXT:    v_ldexp_f32 v1, v1, v6
31978; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31979; GFX11FAKE16-NEXT:    v_ldexp_f32 v2, v2, v4
31980; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
31981; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
31982; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
31983; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
31984; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
31985; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
31986; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
31987; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
31988; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
31989; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
31990; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
31991; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31992; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
31993; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31994; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31995; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
31996; GFX11FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
31997; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31998; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
31999; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
32000; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
32001  %op = sitofp <3 x i64> %x to <3 x bfloat>
32002  ret <3 x bfloat> %op
32003}
32004
; Converts a <4 x i64> to <4 x bfloat> with a single signed int-to-float
; (sitofp) instruction and returns the result.
; The CHECK lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; GCN/GFX7 finish by masking each of v0-v3 with 0xffff0000, while GFX8 and
; newer run a bfe / add 0x7fff / or 0x400000 / cmp_u / cndmask sequence per
; lane (presumably round-to-nearest-even with NaN quieting -- TODO confirm)
; and pack the four results into v0/v1.
; Both gfx1100 RUN lines list the GFX11 prefix, so the GFX11 checks here apply
; with and without real-true16.
32005define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
32006; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
32007; GCN:       ; %bb.0:
32008; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32009; GCN-NEXT:    v_ffbh_i32_e32 v8, v7
32010; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
32011; GCN-NEXT:    v_ffbh_i32_e32 v10, v5
32012; GCN-NEXT:    v_xor_b32_e32 v11, v4, v5
32013; GCN-NEXT:    v_ffbh_i32_e32 v12, v3
32014; GCN-NEXT:    v_xor_b32_e32 v13, v2, v3
32015; GCN-NEXT:    v_ffbh_i32_e32 v14, v1
32016; GCN-NEXT:    v_xor_b32_e32 v15, v0, v1
32017; GCN-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
32018; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
32019; GCN-NEXT:    v_add_i32_e32 v10, vcc, -1, v10
32020; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
32021; GCN-NEXT:    v_add_i32_e32 v12, vcc, -1, v12
32022; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
32023; GCN-NEXT:    v_add_i32_e32 v14, vcc, -1, v14
32024; GCN-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
32025; GCN-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
32026; GCN-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
32027; GCN-NEXT:    v_add_i32_e32 v13, vcc, 32, v13
32028; GCN-NEXT:    v_add_i32_e32 v15, vcc, 32, v15
32029; GCN-NEXT:    v_min_u32_e32 v8, v8, v9
32030; GCN-NEXT:    v_min_u32_e32 v9, v10, v11
32031; GCN-NEXT:    v_min_u32_e32 v10, v12, v13
32032; GCN-NEXT:    v_min_u32_e32 v11, v14, v15
32033; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
32034; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
32035; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v9
32036; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 32, v9
32037; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
32038; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 32, v10
32039; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v11
32040; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 32, v11
32041; GCN-NEXT:    v_min_u32_e32 v6, 1, v6
32042; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
32043; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
32044; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
32045; GCN-NEXT:    v_or_b32_e32 v6, v7, v6
32046; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
32047; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
32048; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
32049; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v6
32050; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v4
32051; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
32052; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
32053; GCN-NEXT:    v_ldexp_f32_e32 v4, v1, v8
32054; GCN-NEXT:    v_ldexp_f32_e32 v3, v3, v9
32055; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v10
32056; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v11
32057; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
32058; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
32059; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
32060; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
32061; GCN-NEXT:    s_setpc_b64 s[30:31]
32062;
32063; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16:
32064; GFX7:       ; %bb.0:
32065; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32066; GFX7-NEXT:    v_xor_b32_e32 v9, v6, v7
32067; GFX7-NEXT:    v_ffbh_i32_e32 v8, v7
32068; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
32069; GFX7-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
32070; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
32071; GFX7-NEXT:    v_min_u32_e32 v8, v8, v9
32072; GFX7-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
32073; GFX7-NEXT:    v_xor_b32_e32 v9, v4, v5
32074; GFX7-NEXT:    v_min_u32_e32 v6, 1, v6
32075; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
32076; GFX7-NEXT:    v_sub_i32_e32 v7, vcc, 32, v8
32077; GFX7-NEXT:    v_ffbh_i32_e32 v8, v5
32078; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
32079; GFX7-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
32080; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
32081; GFX7-NEXT:    v_min_u32_e32 v8, v8, v9
32082; GFX7-NEXT:    v_cvt_f32_i32_e32 v6, v6
32083; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v8
32084; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
32085; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
32086; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v8
32087; GFX7-NEXT:    v_xor_b32_e32 v8, v2, v3
32088; GFX7-NEXT:    v_ldexp_f32_e32 v6, v6, v7
32089; GFX7-NEXT:    v_ffbh_i32_e32 v7, v3
32090; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
32091; GFX7-NEXT:    v_cvt_f32_i32_e32 v4, v4
32092; GFX7-NEXT:    v_add_i32_e32 v7, vcc, -1, v7
32093; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 32, v8
32094; GFX7-NEXT:    v_min_u32_e32 v7, v7, v8
32095; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
32096; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
32097; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
32098; GFX7-NEXT:    v_xor_b32_e32 v5, v0, v1
32099; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
32100; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
32101; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
32102; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
32103; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
32104; GFX7-NEXT:    v_min_u32_e32 v3, v3, v5
32105; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
32106; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
32107; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
32108; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
32109; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
32110; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v7
32111; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
32112; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
32113; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
32114; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
32115; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
32116; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
32117; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
32118; GFX7-NEXT:    s_setpc_b64 s[30:31]
32119;
32120; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
32121; GFX8:       ; %bb.0:
32122; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32123; GFX8-NEXT:    v_xor_b32_e32 v9, v4, v5
32124; GFX8-NEXT:    v_ffbh_i32_e32 v8, v5
32125; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
32126; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
32127; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
32128; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
32129; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
32130; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
32131; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
32132; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
32133; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
32134; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
32135; GFX8-NEXT:    v_ldexp_f32 v8, v4, v5
32136; GFX8-NEXT:    v_bfe_u32 v4, v8, 16, 1
32137; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
32138; GFX8-NEXT:    v_xor_b32_e32 v5, v6, v7
32139; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v4
32140; GFX8-NEXT:    v_ffbh_i32_e32 v4, v7
32141; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
32142; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
32143; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
32144; GFX8-NEXT:    v_min_u32_e32 v10, v4, v5
32145; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
32146; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v8
32147; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
32148; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
32149; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
32150; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
32151; GFX8-NEXT:    v_xor_b32_e32 v9, v0, v1
32152; GFX8-NEXT:    v_ffbh_i32_e32 v8, v1
32153; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
32154; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
32155; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
32156; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
32157; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
32158; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
32159; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
32160; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
32161; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
32162; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
32163; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
32164; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
32165; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
32166; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
32167; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v4
32168; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
32169; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
32170; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
32171; GFX8-NEXT:    v_ldexp_f32 v6, v0, v1
32172; GFX8-NEXT:    v_bfe_u32 v0, v6, 16, 1
32173; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
32174; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
32175; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v0
32176; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
32177; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
32178; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
32179; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
32180; GFX8-NEXT:    v_min_u32_e32 v8, v0, v1
32181; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
32182; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v6
32183; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
32184; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
32185; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
32186; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
32187; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
32188; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
32189; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
32190; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
32191; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
32192; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
32193; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
32194; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32195; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
32196; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
32197; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32198; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
32199; GFX8-NEXT:    v_alignbit_b32 v1, v4, v5, 16
32200; GFX8-NEXT:    s_setpc_b64 s[30:31]
32201;
32202; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
32203; GFX9:       ; %bb.0:
32204; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32205; GFX9-NEXT:    v_xor_b32_e32 v9, v4, v5
32206; GFX9-NEXT:    v_ffbh_i32_e32 v8, v5
32207; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
32208; GFX9-NEXT:    v_add_u32_e32 v8, -1, v8
32209; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
32210; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
32211; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
32212; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
32213; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
32214; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
32215; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
32216; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v8
32217; GFX9-NEXT:    v_ldexp_f32 v8, v4, v5
32218; GFX9-NEXT:    v_bfe_u32 v4, v8, 16, 1
32219; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v7
32220; GFX9-NEXT:    v_add3_u32 v9, v4, v8, s4
32221; GFX9-NEXT:    v_ffbh_i32_e32 v4, v7
32222; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
32223; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
32224; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
32225; GFX9-NEXT:    v_min_u32_e32 v10, v4, v5
32226; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
32227; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v8
32228; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
32229; GFX9-NEXT:    v_xor_b32_e32 v8, v0, v1
32230; GFX9-NEXT:    v_ffbh_i32_e32 v7, v1
32231; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
32232; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
32233; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
32234; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
32235; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
32236; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
32237; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
32238; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
32239; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
32240; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
32241; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
32242; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v10
32243; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
32244; GFX9-NEXT:    v_ldexp_f32 v4, v4, v6
32245; GFX9-NEXT:    v_bfe_u32 v6, v4, 16, 1
32246; GFX9-NEXT:    v_add3_u32 v6, v6, v4, s4
32247; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v4
32248; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
32249; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
32250; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc
32251; GFX9-NEXT:    v_ldexp_f32 v6, v0, v1
32252; GFX9-NEXT:    v_bfe_u32 v0, v6, 16, 1
32253; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
32254; GFX9-NEXT:    v_add3_u32 v7, v0, v6, s4
32255; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
32256; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
32257; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
32258; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
32259; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
32260; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
32261; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v6
32262; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
32263; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
32264; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
32265; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
32266; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
32267; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
32268; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
32269; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
32270; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
32271; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
32272; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32273; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
32274; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
32275; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
32276; GFX9-NEXT:    v_perm_b32 v1, v4, v5, s4
32277; GFX9-NEXT:    s_setpc_b64 s[30:31]
32278;
32279; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
32280; GFX10:       ; %bb.0:
32281; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32282; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
32283; GFX10-NEXT:    v_ffbh_i32_e32 v9, v5
32284; GFX10-NEXT:    v_xor_b32_e32 v11, v6, v7
32285; GFX10-NEXT:    v_xor_b32_e32 v13, v0, v1
32286; GFX10-NEXT:    v_ffbh_i32_e32 v10, v7
32287; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
32288; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
32289; GFX10-NEXT:    v_ffbh_i32_e32 v12, v1
32290; GFX10-NEXT:    v_xor_b32_e32 v14, v2, v3
32291; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
32292; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
32293; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
32294; GFX10-NEXT:    v_add_nc_u32_e32 v12, -1, v12
32295; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
32296; GFX10-NEXT:    v_add_nc_u32_e32 v11, 32, v11
32297; GFX10-NEXT:    v_min_u32_e32 v8, v9, v8
32298; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
32299; GFX10-NEXT:    v_ffbh_i32_e32 v13, v3
32300; GFX10-NEXT:    v_add_nc_u32_e32 v14, 32, v14
32301; GFX10-NEXT:    v_min_u32_e32 v10, v10, v11
32302; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
32303; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
32304; GFX10-NEXT:    v_add_nc_u32_e32 v13, -1, v13
32305; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
32306; GFX10-NEXT:    v_min_u32_e32 v9, v12, v9
32307; GFX10-NEXT:    v_min_u32_e32 v11, v13, v14
32308; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
32309; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
32310; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
32311; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
32312; GFX10-NEXT:    v_min_u32_e32 v5, 1, v6
32313; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
32314; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
32315; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
32316; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v4
32317; GFX10-NEXT:    v_or_b32_e32 v5, v7, v5
32318; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
32319; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
32320; GFX10-NEXT:    v_ldexp_f32 v2, v4, v6
32321; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v5
32322; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v10
32323; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
32324; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
32325; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
32326; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
32327; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
32328; GFX10-NEXT:    v_ldexp_f32 v3, v3, v4
32329; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
32330; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
32331; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
32332; GFX10-NEXT:    v_add3_u32 v4, v7, v2, 0x7fff
32333; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
32334; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
32335; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32336; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
32337; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
32338; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32339; GFX10-NEXT:    v_add3_u32 v4, v6, v3, 0x7fff
32340; GFX10-NEXT:    v_add3_u32 v5, v7, v0, 0x7fff
32341; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v0
32342; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32343; GFX10-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
32344; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
32345; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc_lo
32346; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32347; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
32348; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
32349; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
32350; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc_lo
32351; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
32352; GFX10-NEXT:    s_setpc_b64 s[30:31]
32353;
32354; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
32355; GFX11:       ; %bb.0:
32356; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32357; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
32358; GFX11-NEXT:    v_cls_i32_e32 v9, v5
32359; GFX11-NEXT:    v_xor_b32_e32 v11, v6, v7
32360; GFX11-NEXT:    v_xor_b32_e32 v13, v0, v1
32361; GFX11-NEXT:    v_cls_i32_e32 v10, v7
32362; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
32363; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v9
32364; GFX11-NEXT:    v_cls_i32_e32 v12, v1
32365; GFX11-NEXT:    v_xor_b32_e32 v14, v2, v3
32366; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
32367; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
32368; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
32369; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
32370; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
32371; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
32372; GFX11-NEXT:    v_min_u32_e32 v8, v9, v8
32373; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
32374; GFX11-NEXT:    v_cls_i32_e32 v13, v3
32375; GFX11-NEXT:    v_add_nc_u32_e32 v14, 32, v14
32376; GFX11-NEXT:    v_min_u32_e32 v10, v10, v11
32377; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
32378; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
32379; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v13
32380; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
32381; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
32382; GFX11-NEXT:    v_min_u32_e32 v9, v12, v9
32383; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32384; GFX11-NEXT:    v_min_u32_e32 v11, v13, v14
32385; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
32386; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
32387; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
32388; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
32389; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
32390; GFX11-NEXT:    v_min_u32_e32 v5, 1, v6
32391; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
32392; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
32393; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
32394; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v4
32395; GFX11-NEXT:    v_or_b32_e32 v5, v7, v5
32396; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32397; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
32398; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
32399; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32400; GFX11-NEXT:    v_ldexp_f32 v2, v4, v6
32401; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v5
32402; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v10
32403; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
32404; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
32405; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
32406; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
32407; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
32408; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
32409; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
32410; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
32411; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
32412; GFX11-NEXT:    v_add3_u32 v4, v7, v2, 0x7fff
32413; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
32414; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
32415; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32416; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
32417; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
32418; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32419; GFX11-NEXT:    v_add3_u32 v4, v6, v3, 0x7fff
32420; GFX11-NEXT:    v_add3_u32 v5, v7, v0, 0x7fff
32421; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v0
32422; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32423; GFX11-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
32424; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
32425; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32426; GFX11-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc_lo
32427; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32428; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
32429; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
32430; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32431; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
32432; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v9, vcc_lo
32433; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
32434; GFX11-NEXT:    s_setpc_b64 s[30:31]
32435  %op = sitofp <4 x i64> %x to <4 x bfloat>
32436  ret <4 x bfloat> %op
32437}
32438
; Scalar case: uitofp from i16 to bfloat.
; The check lines inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Shape of the expected output, per check prefix:
;   * GCN and GFX7 mask the input to 16 bits, convert with v_cvt_f32_u32 and
;     then AND the result with 0x7fff0000.
;   * GFX8, GFX9, GFX10 and GFX11 follow the conversion with a
;     v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 sequence and a
;     final 16-bit right shift.
32439define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
32440; GCN-LABEL: v_uitofp_i16_to_bf16:
32441; GCN:       ; %bb.0:
32442; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32443; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32444; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
32445; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32446; GCN-NEXT:    s_setpc_b64 s[30:31]
32447;
32448; GFX7-LABEL: v_uitofp_i16_to_bf16:
32449; GFX7:       ; %bb.0:
32450; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32451; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32452; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
32453; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32454; GFX7-NEXT:    s_setpc_b64 s[30:31]
32455;
32456; GFX8-LABEL: v_uitofp_i16_to_bf16:
32457; GFX8:       ; %bb.0:
32458; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32459; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32460; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
32461; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
32462; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
32463; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
32464; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32465; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
32466; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32467; GFX8-NEXT:    s_setpc_b64 s[30:31]
32468;
32469; GFX9-LABEL: v_uitofp_i16_to_bf16:
32470; GFX9:       ; %bb.0:
32471; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32472; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32473; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
32474; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
32475; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
32476; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
32477; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32478; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
32479; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32480; GFX9-NEXT:    s_setpc_b64 s[30:31]
32481;
32482; GFX10-LABEL: v_uitofp_i16_to_bf16:
32483; GFX10:       ; %bb.0:
32484; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32485; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32486; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
32487; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
32488; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32489; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
32490; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
32491; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32492; GFX10-NEXT:    s_setpc_b64 s[30:31]
32493;
32494; GFX11-LABEL: v_uitofp_i16_to_bf16:
32495; GFX11:       ; %bb.0:
32496; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32497; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32498; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32499; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
32500; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
32501; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
32502; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32503; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
32504; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
32505; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
32506; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
32507; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32508; GFX11-NEXT:    s_setpc_b64 s[30:31]
32509  %op = uitofp i16 %x to bfloat
32510  ret bfloat %op
32511}
32512
; Two-element vector case: uitofp from <2 x i16> to <2 x bfloat>.
; The check lines inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Shape of the expected output, per check prefix:
;   * GCN and GFX7 take the two elements in v0 and v1, mask each to 16 bits,
;     convert, and AND each result with 0x7fff0000.
;   * GFX8 and newer read both 16-bit halves of v0 (SDWA WORD_0 / WORD_1 on
;     GFX8-GFX10, and/shift on GFX11) and pack the two results back into v0:
;     v_alignbit_b32 on GFX8, v_perm_b32 with selector 0x7060302 on GFX9+.
32513define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
32514; GCN-LABEL: v_uitofp_v2i16_to_v2bf16:
32515; GCN:       ; %bb.0:
32516; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32517; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32518; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32519; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
32520; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
32521; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32522; GCN-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
32523; GCN-NEXT:    s_setpc_b64 s[30:31]
32524;
32525; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
32526; GFX7:       ; %bb.0:
32527; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32528; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32529; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32530; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
32531; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
32532; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32533; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
32534; GFX7-NEXT:    s_setpc_b64 s[30:31]
32535;
32536; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
32537; GFX8:       ; %bb.0:
32538; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32539; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32540; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32541; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
32542; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
32543; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
32544; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
32545; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
32546; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
32547; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
32548; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
32549; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
32550; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
32551; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32552; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
32553; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32554; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
32555; GFX8-NEXT:    s_setpc_b64 s[30:31]
32556;
32557; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
32558; GFX9:       ; %bb.0:
32559; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32560; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32561; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32562; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
32563; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
32564; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
32565; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
32566; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
32567; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
32568; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
32569; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
32570; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
32571; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32572; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
32573; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
32574; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
32575; GFX9-NEXT:    s_setpc_b64 s[30:31]
32576;
32577; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
32578; GFX10:       ; %bb.0:
32579; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32580; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32581; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32582; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
32583; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
32584; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
32585; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32586; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
32587; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
32588; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
32589; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
32590; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32591; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
32592; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
32593; GFX10-NEXT:    s_setpc_b64 s[30:31]
32594;
32595; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
32596; GFX11:       ; %bb.0:
32597; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32598; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
32599; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32600; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32601; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
32602; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
32603; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32604; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
32605; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
32606; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
32607; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32608; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
32609; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
32610; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
32611; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32612; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
32613; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32614; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
32615; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
32616; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
32617; GFX11-NEXT:    s_setpc_b64 s[30:31]
32618  %op = uitofp <2 x i16> %x to <2 x bfloat>
32619  ret <2 x bfloat> %op
32620}
32621
; Three-element (odd-sized) vector case: uitofp from <3 x i16> to <3 x bfloat>.
; The check lines inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Shape of the expected output, per check prefix:
;   * GCN and GFX7 take the three elements in v0, v1 and v2, mask each to
;     16 bits, convert, and AND each result with 0x7fff0000.
;   * GFX8 and newer read elements 0 and 1 from the two halves of v0 and
;     element 2 from the low half of v1; elements 0 and 1 are packed into v0.
;   * GFX9 and newer place element 2 into v1 with a v_alignbit_b32 by 16.
; Unlike the neighbouring tests, the GFX11 expectations here use the separate
; GFX11TRUE16 and GFX11FAKE16 prefixes; in the lines below the two sequences
; differ only in the first source operand of that final v_alignbit_b32
; (v0 versus s0).
; NOTE(review): presumably the upper half of v1 is don't-care padding for the
; 3-element result, which would explain the arbitrary source register (s4 on
; GFX10, which is not written earlier in that sequence) - confirm.
32622define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
32623; GCN-LABEL: v_uitofp_v3i16_to_v3bf16:
32624; GCN:       ; %bb.0:
32625; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32626; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32627; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32628; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
32629; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
32630; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
32631; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
32632; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32633; GCN-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
32634; GCN-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
32635; GCN-NEXT:    s_setpc_b64 s[30:31]
32636;
32637; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
32638; GFX7:       ; %bb.0:
32639; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32640; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32641; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32642; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
32643; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
32644; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
32645; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
32646; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32647; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
32648; GFX7-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
32649; GFX7-NEXT:    s_setpc_b64 s[30:31]
32650;
32651; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
32652; GFX8:       ; %bb.0:
32653; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32654; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32655; GFX8-NEXT:    v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32656; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32657; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
32658; GFX8-NEXT:    v_bfe_u32 v2, v1, 16, 1
32659; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
32660; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
32661; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
32662; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
32663; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
32664; GFX8-NEXT:    v_bfe_u32 v2, v4, 16, 1
32665; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
32666; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
32667; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v4
32668; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
32669; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
32670; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
32671; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
32672; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
32673; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
32674; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32675; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
32676; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32677; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
32678; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
32679; GFX8-NEXT:    s_setpc_b64 s[30:31]
32680;
32681; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
32682; GFX9:       ; %bb.0:
32683; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32684; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32685; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32686; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
32687; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32688; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
32689; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
32690; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
32691; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
32692; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
32693; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
32694; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
32695; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
32696; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
32697; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
32698; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
32699; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
32700; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
32701; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32702; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
32703; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
32704; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
32705; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
32706; GFX9-NEXT:    s_setpc_b64 s[30:31]
32707;
32708; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
32709; GFX10:       ; %bb.0:
32710; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32711; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32712; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32713; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32714; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
32715; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
32716; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v2
32717; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32718; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
32719; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
32720; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
32721; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
32722; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
32723; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
32724; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32725; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32726; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32727; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32728; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
32729; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32730; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
32731; GFX10-NEXT:    s_setpc_b64 s[30:31]
32732;
32733; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16:
32734; GFX11TRUE16:       ; %bb.0:
32735; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32736; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32737; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32738; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
32739; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
32740; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
32741; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
32742; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
32743; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
32744; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32745; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
32746; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32747; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
32748; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
32749; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
32750; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
32751; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
32752; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32753; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
32754; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
32755; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
32756; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32757; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32758; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32759; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32760; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32761; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32762; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
32763; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32764; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
32765; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
32766;
32767; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
32768; GFX11FAKE16:       ; %bb.0:
32769; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32770; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32771; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32772; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
32773; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
32774; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
32775; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
32776; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
32777; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
32778; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32779; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
32780; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32781; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
32782; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
32783; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
32784; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
32785; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
32786; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32787; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
32788; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
32789; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
32790; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32791; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32792; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32793; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32794; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32795; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32796; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
32797; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32798; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
32799; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
32800  %op = uitofp <3 x i16> %x to <3 x bfloat>
32801  ret <3 x bfloat> %op
32802}
32803
; Four-element vector case: uitofp from <4 x i16> to <4 x bfloat>.
; The check lines inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Shape of the expected output, per check prefix:
;   * GCN and GFX7 take the four elements in v0..v3, mask each to 16 bits,
;     convert, and AND each result with 0x7fff0000.
;   * GFX8 and newer read the elements from both 16-bit halves of v0 and v1
;     and pack the four results back into v0 and v1: v_alignbit_b32 on GFX8,
;     v_perm_b32 with selector 0x7060302 on GFX9+.
32804define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
32805; GCN-LABEL: v_uitofp_v4i16_to_v4bf16:
32806; GCN:       ; %bb.0:
32807; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32808; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32809; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32810; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
32811; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
32812; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
32813; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
32814; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
32815; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
32816; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32817; GCN-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
32818; GCN-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
32819; GCN-NEXT:    v_and_b32_e32 v3, 0x7fff0000, v3
32820; GCN-NEXT:    s_setpc_b64 s[30:31]
32821;
32822; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
32823; GFX7:       ; %bb.0:
32824; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32825; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
32826; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
32827; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
32828; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
32829; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
32830; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
32831; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
32832; GFX7-NEXT:    v_cvt_f32_u32_e32 v3, v3
32833; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32834; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
32835; GFX7-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
32836; GFX7-NEXT:    v_and_b32_e32 v3, 0x7fff0000, v3
32837; GFX7-NEXT:    s_setpc_b64 s[30:31]
32838;
32839; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
32840; GFX8:       ; %bb.0:
32841; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32842; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32843; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32844; GFX8-NEXT:    v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32845; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
32846; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
32847; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
32848; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
32849; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
32850; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
32851; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
32852; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
32853; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
32854; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
32855; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
32856; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
32857; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
32858; GFX8-NEXT:    v_bfe_u32 v3, v5, 16, 1
32859; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32860; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
32861; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
32862; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v5
32863; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
32864; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
32865; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
32866; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
32867; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
32868; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
32869; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32870; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
32871; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
32872; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32873; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
32874; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
32875; GFX8-NEXT:    s_setpc_b64 s[30:31]
32876;
32877; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
32878; GFX9:       ; %bb.0:
32879; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32880; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32881; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
32882; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32883; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
32884; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
32885; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
32886; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
32887; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
32888; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32889; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32890; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
32891; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
32892; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
32893; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
32894; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
32895; GFX9-NEXT:    v_bfe_u32 v3, v4, 16, 1
32896; GFX9-NEXT:    v_add3_u32 v3, v3, v4, s4
32897; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v4
32898; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
32899; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
32900; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
32901; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
32902; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
32903; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
32904; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
32905; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
32906; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
32907; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
32908; GFX9-NEXT:    s_setpc_b64 s[30:31]
32909;
32910; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
32911; GFX10:       ; %bb.0:
32912; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32913; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32914; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32915; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32916; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32917; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
32918; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
32919; GFX10-NEXT:    v_bfe_u32 v8, v3, 16, 1
32920; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32921; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
32922; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
32923; GFX10-NEXT:    v_bfe_u32 v10, v0, 16, 1
32924; GFX10-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
32925; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
32926; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v0
32927; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32928; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
32929; GFX10-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
32930; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v1
32931; GFX10-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
32932; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
32933; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32934; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
32935; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32936; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
32937; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc_lo
32938; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
32939; GFX10-NEXT:    s_setpc_b64 s[30:31]
32940;
32941; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
32942; GFX11:       ; %bb.0:
32943; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32944; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
32945; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
32946; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32947; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
32948; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
32949; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
32950; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
32951; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
32952; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
32953; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
32954; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v1
32955; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
32956; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v0
32957; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
32958; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
32959; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32960; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32961; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
32962; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
32963; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
32964; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
32965; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
32966; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
32967; GFX11-NEXT:    v_bfe_u32 v10, v0, 16, 1
32968; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
32969; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v0
32970; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
32971; GFX11-NEXT:    v_add3_u32 v10, v10, v0, 0x7fff
32972; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32973; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
32974; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
32975; GFX11-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
32976; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
32977; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32978; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
32979; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc_lo
32980; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
32981; GFX11-NEXT:    s_setpc_b64 s[30:31]
32982  %op = uitofp <4 x i16> %x to <4 x bfloat>
32983  ret <4 x bfloat> %op
32984}
32985
; Scalar case: uitofp from i32 to bfloat.
; The check lines inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Shape of the expected output, per check prefix:
;   * Every prefix starts with a plain v_cvt_f32_u32 on v0; no 16-bit input
;     masking appears, unlike the i16 variants of this test.
;   * GCN and GFX7 then AND the result with 0x7fff0000.
;   * GFX8 and newer then use the v_bfe_u32 / add 0x7fff / v_cmp_u_f32 /
;     v_cndmask_b32 sequence followed by a 16-bit right shift.
32986define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
32987; GCN-LABEL: v_uitofp_i32_to_bf16:
32988; GCN:       ; %bb.0:
32989; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32990; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
32991; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32992; GCN-NEXT:    s_setpc_b64 s[30:31]
32993;
32994; GFX7-LABEL: v_uitofp_i32_to_bf16:
32995; GFX7:       ; %bb.0:
32996; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32997; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
32998; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
32999; GFX7-NEXT:    s_setpc_b64 s[30:31]
33000;
33001; GFX8-LABEL: v_uitofp_i32_to_bf16:
33002; GFX8:       ; %bb.0:
33003; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33004; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33005; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
33006; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
33007; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
33008; GFX8-NEXT:    v_or_b32_e32 v1, 0x400000, v0
33009; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33010; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
33011; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33012; GFX8-NEXT:    s_setpc_b64 s[30:31]
33013;
33014; GFX9-LABEL: v_uitofp_i32_to_bf16:
33015; GFX9:       ; %bb.0:
33016; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33017; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33018; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33019; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
33020; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33021; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
33022; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33023; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
33024; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33025; GFX9-NEXT:    s_setpc_b64 s[30:31]
33026;
33027; GFX10-LABEL: v_uitofp_i32_to_bf16:
33028; GFX10:       ; %bb.0:
33029; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33030; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33031; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
33032; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33033; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33034; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
33035; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33036; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33037; GFX10-NEXT:    s_setpc_b64 s[30:31]
33038;
33039; GFX11-LABEL: v_uitofp_i32_to_bf16:
33040; GFX11:       ; %bb.0:
33041; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33042; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
33043; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33044; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
33045; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33046; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33047; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
33048; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33049; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33050; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33051; GFX11-NEXT:    s_setpc_b64 s[30:31]
33052  %op = uitofp i32 %x to bfloat
33053  ret bfloat %op
33054}
33055
33056define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; Codegen test for the unsigned <2 x i32> to <2 x bfloat> conversion (uitofp).
; The check lines below are generated by utils/update_llc_test_checks.py, so
; regenerate them with the script instead of editing them by hand.
; In the GCN and GFX7 expectations each lane is a v_cvt_f32_u32 followed by a
; v_and_b32 with 0x7fff0000, and the two results are left in v0 and v1.
; From GFX8 onwards each lane goes through a v_bfe_u32, add of 0x7fff,
; v_cmp_u_f32 and v_cndmask_b32 sequence (presumably round-to-nearest-even
; with NaN quieting through the 0x400000 OR - confirm against the lowering).
; The two halves are then packed into v0 with v_alignbit_b32 on GFX8, or with
; v_perm_b32 and selector 0x7060302 on GFX9 and later.
33057; GCN-LABEL: v_uitofp_v2i32_to_v2bf16:
33058; GCN:       ; %bb.0:
33059; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33060; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
33061; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
33062; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
33063; GCN-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
33064; GCN-NEXT:    s_setpc_b64 s[30:31]
33065;
33066; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
33067; GFX7:       ; %bb.0:
33068; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33069; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
33070; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
33071; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
33072; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
33073; GFX7-NEXT:    s_setpc_b64 s[30:31]
33074;
33075; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
33076; GFX8:       ; %bb.0:
33077; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33078; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33079; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
33080; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
33081; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
33082; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
33083; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33084; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33085; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
33086; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
33087; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
33088; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
33089; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v1
33090; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
33091; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
33092; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
33093; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
33094; GFX8-NEXT:    s_setpc_b64 s[30:31]
33095;
33096; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
33097; GFX9:       ; %bb.0:
33098; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33099; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33100; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
33101; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33102; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
33103; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
33104; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
33105; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33106; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
33107; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
33108; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
33109; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
33110; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
33111; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
33112; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
33113; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
33114; GFX9-NEXT:    s_setpc_b64 s[30:31]
33115;
33116; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
33117; GFX10:       ; %bb.0:
33118; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33119; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33120; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
33121; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
33122; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
33123; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
33124; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33125; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
33126; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
33127; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
33128; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33129; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33130; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33131; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33132; GFX10-NEXT:    s_setpc_b64 s[30:31]
33133;
33134; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
33135; GFX11:       ; %bb.0:
33136; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33137; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
33138; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
33139; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33140; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
33141; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
33142; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
33143; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33144; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
33145; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
33146; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
33147; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
33148; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33149; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33150; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33151; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
33152; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33153; GFX11-NEXT:    s_setpc_b64 s[30:31]
33154  %op = uitofp <2 x i32> %x to <2 x bfloat>
33155  ret <2 x bfloat> %op
33156}
33157
33158define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; Codegen test for the unsigned <3 x i32> to <3 x bfloat> conversion (uitofp).
; The check lines below are generated by utils/update_llc_test_checks.py, so
; regenerate them with the script instead of editing them by hand.
; In the GCN and GFX7 expectations each lane is a v_cvt_f32_u32 followed by a
; v_and_b32 with 0x7fff0000, and the three results are left in v0, v1 and v2.
; From GFX8 onwards lanes 0 and 1 are packed into v0 and lane 2 ends up in v1.
; GFX8 shifts lane 2 right by 16 and moves it into v1, while GFX9 and later
; use a v_alignbit_b32 by 16.
; The real-true16 and fake16 GFX11 runs are checked under separate prefixes.
; The only visible difference is the first source operand of the final
; v_alignbit_b32 (v0 with real-true16, s0 without), which presumably feeds the
; unused upper half of v1 - confirm before relying on it.
33159; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
33160; GCN:       ; %bb.0:
33161; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33162; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
33163; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
33164; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
33165; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
33166; GCN-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
33167; GCN-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
33168; GCN-NEXT:    s_setpc_b64 s[30:31]
33169;
33170; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
33171; GFX7:       ; %bb.0:
33172; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33173; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
33174; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
33175; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
33176; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
33177; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
33178; GFX7-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
33179; GFX7-NEXT:    s_setpc_b64 s[30:31]
33180;
33181; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
33182; GFX8:       ; %bb.0:
33183; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33184; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
33185; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33186; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
33187; GFX8-NEXT:    v_bfe_u32 v4, v2, 16, 1
33188; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v2
33189; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
33190; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v2
33191; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
33192; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
33193; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
33194; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
33195; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
33196; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
33197; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33198; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
33199; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
33200; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
33201; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
33202; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
33203; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
33204; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
33205; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
33206; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
33207; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
33208; GFX8-NEXT:    v_mov_b32_e32 v1, v2
33209; GFX8-NEXT:    s_setpc_b64 s[30:31]
33210;
33211; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
33212; GFX9:       ; %bb.0:
33213; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33214; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
33215; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33216; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33217; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
33218; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
33219; GFX9-NEXT:    v_add3_u32 v3, v3, v2, s4
33220; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
33221; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
33222; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
33223; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
33224; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
33225; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
33226; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33227; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
33228; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
33229; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
33230; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
33231; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
33232; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
33233; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
33234; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
33235; GFX9-NEXT:    v_alignbit_b32 v1, s4, v2, 16
33236; GFX9-NEXT:    s_setpc_b64 s[30:31]
33237;
33238; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
33239; GFX10:       ; %bb.0:
33240; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33241; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33242; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
33243; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
33244; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
33245; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
33246; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
33247; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33248; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
33249; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
33250; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
33251; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
33252; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
33253; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
33254; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33255; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33256; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33257; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
33258; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33259; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33260; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
33261; GFX10-NEXT:    s_setpc_b64 s[30:31]
33262;
33263; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16:
33264; GFX11TRUE16:       ; %bb.0:
33265; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33266; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
33267; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
33268; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
33269; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
33270; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
33271; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
33272; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
33273; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33274; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
33275; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
33276; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
33277; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
33278; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
33279; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
33280; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33281; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33282; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33283; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
33284; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
33285; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33286; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33287; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
33288; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
33289;
33290; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
33291; GFX11FAKE16:       ; %bb.0:
33292; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33293; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
33294; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
33295; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
33296; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
33297; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
33298; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
33299; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
33300; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33301; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
33302; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
33303; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
33304; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
33305; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
33306; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
33307; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33308; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33309; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33310; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
33311; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
33312; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33313; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33314; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v2, 16
33315; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
33316  %op = uitofp <3 x i32> %x to <3 x bfloat>
33317  ret <3 x bfloat> %op
33318}
33319
33320define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; Codegen test for the unsigned <4 x i32> to <4 x bfloat> conversion (uitofp).
; The check lines below are generated by utils/update_llc_test_checks.py, so
; regenerate them with the script instead of editing them by hand.
; In the GCN and GFX7 expectations each lane is a v_cvt_f32_u32 followed by a
; v_and_b32 with 0x7fff0000, and the four results are left in v0 through v3.
; From GFX8 onwards lanes 0 and 1 are packed into v0 and lanes 2 and 3 into v1,
; using v_alignbit_b32 on GFX8 and v_perm_b32 with selector 0x7060302 later.
; The GFX8 expectations load 0x7fff into s4 but use the register in only one
; of the four adds. This looks like captured compiler output and not a hand
; edit - rerun the update script to confirm before changing it.
33321; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
33322; GCN:       ; %bb.0:
33323; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33324; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
33325; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
33326; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
33327; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
33328; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
33329; GCN-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
33330; GCN-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
33331; GCN-NEXT:    v_and_b32_e32 v3, 0x7fff0000, v3
33332; GCN-NEXT:    s_setpc_b64 s[30:31]
33333;
33334; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
33335; GFX7:       ; %bb.0:
33336; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33337; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
33338; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
33339; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
33340; GFX7-NEXT:    v_cvt_f32_u32_e32 v3, v3
33341; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
33342; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
33343; GFX7-NEXT:    v_and_b32_e32 v2, 0x7fff0000, v2
33344; GFX7-NEXT:    v_and_b32_e32 v3, 0x7fff0000, v3
33345; GFX7-NEXT:    s_setpc_b64 s[30:31]
33346;
33347; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
33348; GFX8:       ; %bb.0:
33349; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33350; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
33351; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
33352; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33353; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
33354; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
33355; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
33356; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
33357; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
33358; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
33359; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
33360; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
33361; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
33362; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
33363; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v3
33364; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
33365; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
33366; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
33367; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
33368; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
33369; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
33370; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
33371; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33372; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
33373; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
33374; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
33375; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
33376; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v1
33377; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
33378; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
33379; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
33380; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
33381; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
33382; GFX8-NEXT:    v_alignbit_b32 v1, v3, v2, 16
33383; GFX8-NEXT:    s_setpc_b64 s[30:31]
33384;
33385; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
33386; GFX9:       ; %bb.0:
33387; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33388; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
33389; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
33390; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33391; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33392; GFX9-NEXT:    v_bfe_u32 v4, v2, 16, 1
33393; GFX9-NEXT:    v_add3_u32 v4, v4, v2, s4
33394; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v2
33395; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
33396; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
33397; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
33398; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
33399; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
33400; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
33401; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
33402; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
33403; GFX9-NEXT:    v_bfe_u32 v4, v0, 16, 1
33404; GFX9-NEXT:    v_add3_u32 v4, v4, v0, s4
33405; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
33406; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33407; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
33408; GFX9-NEXT:    v_bfe_u32 v4, v1, 16, 1
33409; GFX9-NEXT:    v_add3_u32 v4, v4, v1, s4
33410; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
33411; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
33412; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
33413; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
33414; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
33415; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
33416; GFX9-NEXT:    s_setpc_b64 s[30:31]
33417;
33418; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
33419; GFX10:       ; %bb.0:
33420; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33421; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
33422; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33423; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
33424; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
33425; GFX10-NEXT:    v_bfe_u32 v4, v2, 16, 1
33426; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
33427; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
33428; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
33429; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
33430; GFX10-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
33431; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
33432; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
33433; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
33434; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v1
33435; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
33436; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33437; GFX10-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
33438; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
33439; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v3
33440; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
33441; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33442; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
33443; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
33444; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33445; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
33446; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
33447; GFX10-NEXT:    s_setpc_b64 s[30:31]
33448;
33449; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
33450; GFX11:       ; %bb.0:
33451; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33452; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
33453; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
33454; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
33455; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
33456; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
33457; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
33458; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
33459; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
33460; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
33461; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
33462; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
33463; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
33464; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
33465; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
33466; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
33467; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
33468; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33469; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
33470; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
33471; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
33472; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33473; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
33474; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
33475; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
33476; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
33477; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33478; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
33479; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
33480; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
33481; GFX11-NEXT:    s_setpc_b64 s[30:31]
33482  %op = uitofp <4 x i32> %x to <4 x bfloat>
33483  ret <4 x bfloat> %op
33484}
33485
33486define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; Codegen test for the scalar unsigned i64 to bfloat conversion (uitofp).
; The check lines below are generated by utils/update_llc_test_checks.py, so
; regenerate them with the script instead of editing them by hand.
; Every target first reduces the 64-bit input held in v[0:1] to a 32-bit value.
; It counts leading zeros of the high half (v_ffbh_u32, or v_clz_i32_u32 on
; GFX11), clamps the count to 32, shifts the pair left by it, and ORs
; min(low half, 1) into the high half.
; That value is converted with v_cvt_f32_u32 and rescaled with v_ldexp_f32 by
; 32 minus the shift amount.
; The GCN and GFX7 expectations then mask with 0xffff0000. GFX8 and later
; instead run the v_bfe_u32, add of 0x7fff and v_cndmask_b32 sequence followed
; by a right shift of 16 (presumably round-to-nearest-even with NaN quieting -
; confirm against the lowering).
33487; GCN-LABEL: v_uitofp_i64_to_bf16:
33488; GCN:       ; %bb.0:
33489; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33490; GCN-NEXT:    v_ffbh_u32_e32 v2, v1
33491; GCN-NEXT:    v_min_u32_e32 v2, 32, v2
33492; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
33493; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
33494; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
33495; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
33496; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
33497; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
33498; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
33499; GCN-NEXT:    s_setpc_b64 s[30:31]
33500;
33501; GFX7-LABEL: v_uitofp_i64_to_bf16:
33502; GFX7:       ; %bb.0:
33503; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33504; GFX7-NEXT:    v_ffbh_u32_e32 v2, v1
33505; GFX7-NEXT:    v_min_u32_e32 v2, 32, v2
33506; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
33507; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
33508; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
33509; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
33510; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
33511; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
33512; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
33513; GFX7-NEXT:    s_setpc_b64 s[30:31]
33514;
33515; GFX8-LABEL: v_uitofp_i64_to_bf16:
33516; GFX8:       ; %bb.0:
33517; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33518; GFX8-NEXT:    v_ffbh_u32_e32 v2, v1
33519; GFX8-NEXT:    v_min_u32_e32 v2, 32, v2
33520; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
33521; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
33522; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
33523; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33524; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
33525; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
33526; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
33527; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
33528; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
33529; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33530; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33531; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
33532; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33533; GFX8-NEXT:    s_setpc_b64 s[30:31]
33534;
33535; GFX9-LABEL: v_uitofp_i64_to_bf16:
33536; GFX9:       ; %bb.0:
33537; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33538; GFX9-NEXT:    v_ffbh_u32_e32 v2, v1
33539; GFX9-NEXT:    v_min_u32_e32 v2, 32, v2
33540; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
33541; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33542; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
33543; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
33544; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33545; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
33546; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
33547; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
33548; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33549; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
33550; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33551; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
33552; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33553; GFX9-NEXT:    s_setpc_b64 s[30:31]
33554;
33555; GFX10-LABEL: v_uitofp_i64_to_bf16:
33556; GFX10:       ; %bb.0:
33557; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33558; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
33559; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
33560; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
33561; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
33562; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
33563; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
33564; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33565; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
33566; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
33567; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33568; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33569; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
33570; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33571; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33572; GFX10-NEXT:    s_setpc_b64 s[30:31]
33573;
33574; GFX11-LABEL: v_uitofp_i64_to_bf16:
33575; GFX11:       ; %bb.0:
33576; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33577; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
33578; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33579; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
33580; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
33581; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33582; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
33583; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
33584; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
33585; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
33586; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
33587; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
33588; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33589; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
33590; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
33591; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33592; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
33593; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33594; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33595; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33596; GFX11-NEXT:    s_setpc_b64 s[30:31]
33597  %op = uitofp i64 %x to bfloat
33598  ret bfloat %op
33599}
33600
33601define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
33602; GCN-LABEL: v_uitofp_v2i64_to_v2bf16:
33603; GCN:       ; %bb.0:
33604; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33605; GCN-NEXT:    v_ffbh_u32_e32 v4, v3
33606; GCN-NEXT:    v_ffbh_u32_e32 v5, v1
33607; GCN-NEXT:    v_min_u32_e32 v4, 32, v4
33608; GCN-NEXT:    v_min_u32_e32 v5, 32, v5
33609; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
33610; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
33611; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v5
33612; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 32, v5
33613; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
33614; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
33615; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
33616; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
33617; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v2
33618; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
33619; GCN-NEXT:    v_ldexp_f32_e32 v1, v1, v4
33620; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v5
33621; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
33622; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
33623; GCN-NEXT:    s_setpc_b64 s[30:31]
33624;
33625; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16:
33626; GFX7:       ; %bb.0:
33627; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33628; GFX7-NEXT:    v_ffbh_u32_e32 v4, v3
33629; GFX7-NEXT:    v_min_u32_e32 v4, 32, v4
33630; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
33631; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
33632; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
33633; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
33634; GFX7-NEXT:    v_ffbh_u32_e32 v3, v1
33635; GFX7-NEXT:    v_min_u32_e32 v3, 32, v3
33636; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
33637; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
33638; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
33639; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
33640; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
33641; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v4
33642; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
33643; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
33644; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
33645; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
33646; GFX7-NEXT:    s_setpc_b64 s[30:31]
33647;
33648; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
33649; GFX8:       ; %bb.0:
33650; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33651; GFX8-NEXT:    v_ffbh_u32_e32 v4, v1
33652; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
33653; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
33654; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
33655; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
33656; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33657; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v4
33658; GFX8-NEXT:    v_ldexp_f32 v4, v0, v1
33659; GFX8-NEXT:    v_bfe_u32 v0, v4, 16, 1
33660; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
33661; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v0
33662; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
33663; GFX8-NEXT:    v_min_u32_e32 v6, 32, v0
33664; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
33665; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v4
33666; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
33667; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
33668; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33669; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
33670; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
33671; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
33672; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
33673; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
33674; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
33675; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
33676; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
33677; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33678; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
33679; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
33680; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
33681; GFX8-NEXT:    s_setpc_b64 s[30:31]
33682;
33683; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
33684; GFX9:       ; %bb.0:
33685; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33686; GFX9-NEXT:    v_ffbh_u32_e32 v4, v1
33687; GFX9-NEXT:    v_min_u32_e32 v4, 32, v4
33688; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
33689; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33690; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
33691; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
33692; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33693; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
33694; GFX9-NEXT:    v_ldexp_f32 v4, v0, v1
33695; GFX9-NEXT:    v_bfe_u32 v0, v4, 16, 1
33696; GFX9-NEXT:    v_add3_u32 v5, v0, v4, s4
33697; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
33698; GFX9-NEXT:    v_min_u32_e32 v6, 32, v0
33699; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
33700; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v4
33701; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
33702; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
33703; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33704; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
33705; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
33706; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
33707; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
33708; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
33709; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
33710; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
33711; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33712; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
33713; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
33714; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
33715; GFX9-NEXT:    s_setpc_b64 s[30:31]
33716;
33717; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
33718; GFX10:       ; %bb.0:
33719; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33720; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
33721; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
33722; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
33723; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
33724; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
33725; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
33726; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
33727; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
33728; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
33729; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
33730; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
33731; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
33732; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33733; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
33734; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
33735; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
33736; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
33737; GFX10-NEXT:    v_bfe_u32 v3, v1, 16, 1
33738; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v0
33739; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33740; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v1
33741; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
33742; GFX10-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
33743; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33744; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33745; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33746; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33747; GFX10-NEXT:    s_setpc_b64 s[30:31]
33748;
33749; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
33750; GFX11:       ; %bb.0:
33751; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33752; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
33753; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
33754; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33755; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
33756; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
33757; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33758; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
33759; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
33760; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33761; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
33762; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
33763; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33764; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
33765; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
33766; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
33767; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
33768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
33769; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
33770; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
33771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33772; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
33773; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
33774; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33775; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
33776; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
33777; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
33778; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33779; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
33780; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
33781; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
33782; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
33783; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33784; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33785; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33786; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
33787; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
33788; GFX11-NEXT:    s_setpc_b64 s[30:31]
33789  %op = uitofp <2 x i64> %x to <2 x bfloat>
33790  ret <2 x bfloat> %op
33791}
33792
; Test case: unsigned integer to floating-point conversion (uitofp) of a
; <3 x i64> argument to <3 x bfloat>, with the result returned to the caller.
;
; The assertion lines inside this function are autogenerated (see the NOTE on
; the first line of this file). Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; There is one group of assertions per check prefix named in the RUN lines at
; the top of the file. For this function the two gfx1100 runs (+real-true16 and
; -real-true16) have separate groups rather than a shared one.
33793define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
33794; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
33795; GCN:       ; %bb.0:
33796; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33797; GCN-NEXT:    v_ffbh_u32_e32 v6, v5
33798; GCN-NEXT:    v_ffbh_u32_e32 v7, v3
33799; GCN-NEXT:    v_ffbh_u32_e32 v8, v1
33800; GCN-NEXT:    v_min_u32_e32 v6, 32, v6
33801; GCN-NEXT:    v_min_u32_e32 v7, 32, v7
33802; GCN-NEXT:    v_min_u32_e32 v8, 32, v8
33803; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
33804; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 32, v6
33805; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
33806; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 32, v7
33807; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
33808; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
33809; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
33810; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
33811; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
33812; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
33813; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
33814; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
33815; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v4
33816; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
33817; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
33818; GCN-NEXT:    v_ldexp_f32_e32 v3, v1, v6
33819; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v7
33820; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v8
33821; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
33822; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
33823; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
33824; GCN-NEXT:    s_setpc_b64 s[30:31]
33825;
33826; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16:
33827; GFX7:       ; %bb.0:
33828; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33829; GFX7-NEXT:    v_ffbh_u32_e32 v6, v5
33830; GFX7-NEXT:    v_min_u32_e32 v6, 32, v6
33831; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
33832; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
33833; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
33834; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
33835; GFX7-NEXT:    v_ffbh_u32_e32 v6, v3
33836; GFX7-NEXT:    v_min_u32_e32 v6, 32, v6
33837; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
33838; GFX7-NEXT:    v_cvt_f32_u32_e32 v4, v4
33839; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
33840; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
33841; GFX7-NEXT:    v_ffbh_u32_e32 v3, v1
33842; GFX7-NEXT:    v_min_u32_e32 v3, 32, v3
33843; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
33844; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
33845; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
33846; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
33847; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
33848; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
33849; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
33850; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
33851; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
33852; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
33853; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
33854; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
33855; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
33856; GFX7-NEXT:    s_setpc_b64 s[30:31]
33857;
33858; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
33859; GFX8:       ; %bb.0:
33860; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33861; GFX8-NEXT:    v_ffbh_u32_e32 v6, v5
33862; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
33863; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
33864; GFX8-NEXT:    v_ffbh_u32_e32 v7, v1
33865; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
33866; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
33867; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
33868; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
33869; GFX8-NEXT:    v_min_u32_e32 v7, 32, v7
33870; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
33871; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
33872; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
33873; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
33874; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
33875; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
33876; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
33877; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
33878; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
33879; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
33880; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
33881; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
33882; GFX8-NEXT:    v_ffbh_u32_e32 v5, v3
33883; GFX8-NEXT:    v_min_u32_e32 v5, 32, v5
33884; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
33885; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
33886; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v7
33887; GFX8-NEXT:    v_ldexp_f32 v0, v0, v4
33888; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
33889; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
33890; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
33891; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
33892; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
33893; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
33894; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v0
33895; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33896; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
33897; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
33898; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
33899; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
33900; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
33901; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
33902; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v2
33903; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
33904; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
33905; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
33906; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
33907; GFX8-NEXT:    s_setpc_b64 s[30:31]
33908;
33909; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
33910; GFX9:       ; %bb.0:
33911; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33912; GFX9-NEXT:    v_ffbh_u32_e32 v6, v5
33913; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
33914; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
33915; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
33916; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
33917; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
33918; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
33919; GFX9-NEXT:    v_ffbh_u32_e32 v6, v1
33920; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
33921; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
33922; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
33923; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
33924; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
33925; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33926; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
33927; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
33928; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s4
33929; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v4
33930; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
33931; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
33932; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
33933; GFX9-NEXT:    v_ldexp_f32 v5, v0, v1
33934; GFX9-NEXT:    v_bfe_u32 v0, v5, 16, 1
33935; GFX9-NEXT:    v_add3_u32 v6, v0, v5, s4
33936; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
33937; GFX9-NEXT:    v_min_u32_e32 v7, 32, v0
33938; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
33939; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v5
33940; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
33941; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
33942; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
33943; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
33944; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
33945; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
33946; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
33947; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
33948; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
33949; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
33950; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
33951; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
33952; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
33953; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
33954; GFX9-NEXT:    v_alignbit_b32 v1, s4, v4, 16
33955; GFX9-NEXT:    s_setpc_b64 s[30:31]
33956;
33957; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
33958; GFX10:       ; %bb.0:
33959; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33960; GFX10-NEXT:    v_ffbh_u32_e32 v6, v1
33961; GFX10-NEXT:    v_ffbh_u32_e32 v8, v3
33962; GFX10-NEXT:    v_ffbh_u32_e32 v7, v5
33963; GFX10-NEXT:    v_min_u32_e32 v6, 32, v6
33964; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
33965; GFX10-NEXT:    v_min_u32_e32 v7, 32, v7
33966; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
33967; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
33968; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
33969; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v7
33970; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
33971; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
33972; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
33973; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
33974; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
33975; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
33976; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
33977; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
33978; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
33979; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
33980; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
33981; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
33982; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
33983; GFX10-NEXT:    v_ldexp_f32 v1, v1, v7
33984; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
33985; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
33986; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v0
33987; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
33988; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
33989; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
33990; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
33991; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
33992; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
33993; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
33994; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33995; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
33996; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
33997; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
33998; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
33999; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34000; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
34001; GFX10-NEXT:    s_setpc_b64 s[30:31]
34002;
34003; GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16:
34004; GFX11TRUE16:       ; %bb.0:
34005; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34006; GFX11TRUE16-NEXT:    v_clz_i32_u32_e32 v6, v1
34007; GFX11TRUE16-NEXT:    v_clz_i32_u32_e32 v7, v5
34008; GFX11TRUE16-NEXT:    v_clz_i32_u32_e32 v8, v3
34009; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34010; GFX11TRUE16-NEXT:    v_min_u32_e32 v6, 32, v6
34011; GFX11TRUE16-NEXT:    v_min_u32_e32 v7, 32, v7
34012; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34013; GFX11TRUE16-NEXT:    v_min_u32_e32 v8, 32, v8
34014; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
34015; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34016; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
34017; GFX11TRUE16-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
34018; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v7, 32, v7
34019; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34020; GFX11TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
34021; GFX11TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
34022; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
34023; GFX11TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
34024; GFX11TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
34025; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34026; GFX11TRUE16-NEXT:    v_or_b32_e32 v1, v5, v4
34027; GFX11TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
34028; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
34029; GFX11TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
34030; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
34031; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
34032; GFX11TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
34033; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34034; GFX11TRUE16-NEXT:    v_ldexp_f32 v0, v0, v3
34035; GFX11TRUE16-NEXT:    v_ldexp_f32 v1, v1, v7
34036; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34037; GFX11TRUE16-NEXT:    v_ldexp_f32 v2, v2, v4
34038; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
34039; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34040; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
34041; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
34042; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
34043; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
34044; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
34045; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
34046; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
34047; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
34048; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
34049; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
34050; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
34051; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
34052; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
34053; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
34054; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34055; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
34056; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34057; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
34058; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
34059;
34060; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
34061; GFX11FAKE16:       ; %bb.0:
34062; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34063; GFX11FAKE16-NEXT:    v_clz_i32_u32_e32 v6, v1
34064; GFX11FAKE16-NEXT:    v_clz_i32_u32_e32 v7, v5
34065; GFX11FAKE16-NEXT:    v_clz_i32_u32_e32 v8, v3
34066; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34067; GFX11FAKE16-NEXT:    v_min_u32_e32 v6, 32, v6
34068; GFX11FAKE16-NEXT:    v_min_u32_e32 v7, 32, v7
34069; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34070; GFX11FAKE16-NEXT:    v_min_u32_e32 v8, 32, v8
34071; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
34072; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34073; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
34074; GFX11FAKE16-NEXT:    v_lshlrev_b64 v[2:3], v8, v[2:3]
34075; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v7, 32, v7
34076; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34077; GFX11FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
34078; GFX11FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
34079; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
34080; GFX11FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
34081; GFX11FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
34082; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34083; GFX11FAKE16-NEXT:    v_or_b32_e32 v1, v5, v4
34084; GFX11FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
34085; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
34086; GFX11FAKE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
34087; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
34088; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
34089; GFX11FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
34090; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34091; GFX11FAKE16-NEXT:    v_ldexp_f32 v0, v0, v3
34092; GFX11FAKE16-NEXT:    v_ldexp_f32 v1, v1, v7
34093; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34094; GFX11FAKE16-NEXT:    v_ldexp_f32 v2, v2, v4
34095; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
34096; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34097; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
34098; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
34099; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
34100; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
34101; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
34102; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
34103; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
34104; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
34105; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
34106; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
34107; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
34108; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
34109; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
34110; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
34111; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34112; GFX11FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
34113; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
34114; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
34115; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
34116  %op = uitofp <3 x i64> %x to <3 x bfloat>
34117  ret <3 x bfloat> %op
34118}
34119
34120define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
34121; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
34122; GCN:       ; %bb.0:
34123; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34124; GCN-NEXT:    v_ffbh_u32_e32 v8, v7
34125; GCN-NEXT:    v_ffbh_u32_e32 v9, v5
34126; GCN-NEXT:    v_ffbh_u32_e32 v10, v3
34127; GCN-NEXT:    v_ffbh_u32_e32 v11, v1
34128; GCN-NEXT:    v_min_u32_e32 v8, 32, v8
34129; GCN-NEXT:    v_min_u32_e32 v9, 32, v9
34130; GCN-NEXT:    v_min_u32_e32 v10, 32, v10
34131; GCN-NEXT:    v_min_u32_e32 v11, 32, v11
34132; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
34133; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
34134; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v9
34135; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 32, v9
34136; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
34137; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 32, v10
34138; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v11
34139; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 32, v11
34140; GCN-NEXT:    v_min_u32_e32 v6, 1, v6
34141; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
34142; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
34143; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
34144; GCN-NEXT:    v_or_b32_e32 v6, v7, v6
34145; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
34146; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
34147; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
34148; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v6
34149; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v4
34150; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
34151; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
34152; GCN-NEXT:    v_ldexp_f32_e32 v4, v1, v8
34153; GCN-NEXT:    v_ldexp_f32_e32 v3, v3, v9
34154; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v10
34155; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v11
34156; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34157; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
34158; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
34159; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
34160; GCN-NEXT:    s_setpc_b64 s[30:31]
34161;
34162; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16:
34163; GFX7:       ; %bb.0:
34164; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34165; GFX7-NEXT:    v_ffbh_u32_e32 v8, v7
34166; GFX7-NEXT:    v_min_u32_e32 v8, 32, v8
34167; GFX7-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
34168; GFX7-NEXT:    v_min_u32_e32 v6, 1, v6
34169; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
34170; GFX7-NEXT:    v_cvt_f32_u32_e32 v6, v6
34171; GFX7-NEXT:    v_sub_i32_e32 v7, vcc, 32, v8
34172; GFX7-NEXT:    v_ffbh_u32_e32 v8, v5
34173; GFX7-NEXT:    v_ldexp_f32_e32 v6, v6, v7
34174; GFX7-NEXT:    v_ffbh_u32_e32 v7, v3
34175; GFX7-NEXT:    v_min_u32_e32 v7, 32, v7
34176; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
34177; GFX7-NEXT:    v_min_u32_e32 v8, 32, v8
34178; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
34179; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v8
34180; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
34181; GFX7-NEXT:    v_ffbh_u32_e32 v3, v1
34182; GFX7-NEXT:    v_min_u32_e32 v3, 32, v3
34183; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
34184; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
34185; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
34186; GFX7-NEXT:    v_cvt_f32_u32_e32 v4, v4
34187; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
34188; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
34189; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
34190; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
34191; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v8
34192; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
34193; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v7
34194; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
34195; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
34196; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
34197; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34198; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
34199; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
34200; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
34201; GFX7-NEXT:    s_setpc_b64 s[30:31]
34202;
34203; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
34204; GFX8:       ; %bb.0:
34205; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34206; GFX8-NEXT:    v_ffbh_u32_e32 v8, v5
34207; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
34208; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
34209; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
34210; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
34211; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
34212; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
34213; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
34214; GFX8-NEXT:    v_ldexp_f32 v8, v4, v5
34215; GFX8-NEXT:    v_bfe_u32 v4, v8, 16, 1
34216; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
34217; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v4
34218; GFX8-NEXT:    v_ffbh_u32_e32 v4, v7
34219; GFX8-NEXT:    v_min_u32_e32 v10, 32, v4
34220; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
34221; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v8
34222; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
34223; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
34224; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
34225; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
34226; GFX8-NEXT:    v_ffbh_u32_e32 v8, v1
34227; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
34228; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
34229; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
34230; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
34231; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
34232; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
34233; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
34234; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
34235; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
34236; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
34237; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
34238; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v4
34239; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
34240; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
34241; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
34242; GFX8-NEXT:    v_ldexp_f32 v6, v0, v1
34243; GFX8-NEXT:    v_bfe_u32 v0, v6, 16, 1
34244; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
34245; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v0
34246; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
34247; GFX8-NEXT:    v_min_u32_e32 v8, 32, v0
34248; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
34249; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v6
34250; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
34251; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
34252; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
34253; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
34254; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
34255; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
34256; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
34257; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
34258; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
34259; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
34260; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
34261; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
34262; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
34263; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
34264; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
34265; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
34266; GFX8-NEXT:    v_alignbit_b32 v1, v4, v5, 16
34267; GFX8-NEXT:    s_setpc_b64 s[30:31]
34268;
34269; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
34270; GFX9:       ; %bb.0:
34271; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34272; GFX9-NEXT:    v_ffbh_u32_e32 v8, v5
34273; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
34274; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
34275; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
34276; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
34277; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
34278; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
34279; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v8
34280; GFX9-NEXT:    v_ldexp_f32 v8, v4, v5
34281; GFX9-NEXT:    v_bfe_u32 v4, v8, 16, 1
34282; GFX9-NEXT:    v_add3_u32 v9, v4, v8, s4
34283; GFX9-NEXT:    v_ffbh_u32_e32 v4, v7
34284; GFX9-NEXT:    v_min_u32_e32 v10, 32, v4
34285; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
34286; GFX9-NEXT:    v_ffbh_u32_e32 v7, v1
34287; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
34288; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
34289; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
34290; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
34291; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
34292; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
34293; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v8
34294; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
34295; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
34296; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
34297; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v10
34298; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
34299; GFX9-NEXT:    v_ldexp_f32 v4, v4, v6
34300; GFX9-NEXT:    v_bfe_u32 v6, v4, 16, 1
34301; GFX9-NEXT:    v_add3_u32 v6, v6, v4, s4
34302; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v4
34303; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
34304; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
34305; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc
34306; GFX9-NEXT:    v_ldexp_f32 v6, v0, v1
34307; GFX9-NEXT:    v_bfe_u32 v0, v6, 16, 1
34308; GFX9-NEXT:    v_add3_u32 v7, v0, v6, s4
34309; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
34310; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
34311; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
34312; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v6
34313; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
34314; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
34315; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
34316; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
34317; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
34318; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
34319; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
34320; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
34321; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
34322; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
34323; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
34324; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
34325; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
34326; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
34327; GFX9-NEXT:    v_perm_b32 v1, v4, v5, s4
34328; GFX9-NEXT:    s_setpc_b64 s[30:31]
34329;
34330; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
34331; GFX10:       ; %bb.0:
34332; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34333; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
34334; GFX10-NEXT:    v_ffbh_u32_e32 v10, v1
34335; GFX10-NEXT:    v_ffbh_u32_e32 v11, v3
34336; GFX10-NEXT:    v_ffbh_u32_e32 v9, v7
34337; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
34338; GFX10-NEXT:    v_min_u32_e32 v10, 32, v10
34339; GFX10-NEXT:    v_min_u32_e32 v11, 32, v11
34340; GFX10-NEXT:    v_min_u32_e32 v9, 32, v9
34341; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
34342; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
34343; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
34344; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
34345; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
34346; GFX10-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
34347; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
34348; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
34349; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
34350; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
34351; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
34352; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
34353; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
34354; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
34355; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v11
34356; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v4
34357; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
34358; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
34359; GFX10-NEXT:    v_or_b32_e32 v6, v7, v6
34360; GFX10-NEXT:    v_ldexp_f32 v2, v2, v8
34361; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
34362; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
34363; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v6
34364; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
34365; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
34366; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
34367; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
34368; GFX10-NEXT:    v_ldexp_f32 v4, v4, v9
34369; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
34370; GFX10-NEXT:    v_bfe_u32 v8, v1, 16, 1
34371; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
34372; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
34373; GFX10-NEXT:    v_bfe_u32 v6, v4, 16, 1
34374; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
34375; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
34376; GFX10-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
34377; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v1
34378; GFX10-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
34379; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v4
34380; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
34381; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
34382; GFX10-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc_lo
34383; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
34384; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
34385; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
34386; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
34387; GFX10-NEXT:    s_setpc_b64 s[30:31]
34388;
34389; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
34390; GFX11:       ; %bb.0:
34391; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34392; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
34393; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
34394; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v3
34395; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v7
34396; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34397; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
34398; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
34399; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34400; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
34401; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
34402; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34403; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
34404; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
34405; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34406; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
34407; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
34408; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
34409; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
34410; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
34411; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
34412; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
34413; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
34414; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34415; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
34416; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
34417; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
34418; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
34419; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
34420; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v11
34421; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v4
34422; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
34423; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
34424; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
34425; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34426; GFX11-NEXT:    v_ldexp_f32 v2, v2, v8
34427; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
34428; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34429; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
34430; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v6
34431; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
34432; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
34433; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
34434; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
34435; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
34436; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
34437; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
34438; GFX11-NEXT:    v_ldexp_f32 v4, v4, v9
34439; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
34440; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
34441; GFX11-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
34442; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
34443; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
34444; GFX11-NEXT:    v_bfe_u32 v6, v4, 16, 1
34445; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v1
34446; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v4
34447; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
34448; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
34449; GFX11-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
34450; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc_lo
34451; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
34452; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
34453; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
34454; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
34455; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34456; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
34457; GFX11-NEXT:    s_setpc_b64 s[30:31]
34458  %op = uitofp <4 x i64> %x to <4 x bfloat>
34459  ret <4 x bfloat> %op
34460}
34461
; Scalar bfloat select: returns %a when the i1 %cond is true, otherwise %b.
; The check lines below are autogenerated (see the NOTE at the top of the
; file) and pin the expected output for each RUN-line prefix:
;   * GCN and GFX7 multiply both bfloat inputs by 1.0 before the
;     v_cndmask and mask the result with 0xffff0000.
;   * GFX8, GFX9, GFX10 and GFX11FAKE16 have neither the multiplies nor the
;     mask; the select is a single v_cndmask_b32.
;   * GFX11TRUE16 copies the inputs into v0.l/v0.h and uses v_cndmask_b16.
34462define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
34463; GCN-LABEL: v_select_bf16:
34464; GCN:       ; %bb.0:
34465; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34466; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
34467; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34468; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
34469; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34470; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34471; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34472; GCN-NEXT:    s_setpc_b64 s[30:31]
34473;
34474; GFX7-LABEL: v_select_bf16:
34475; GFX7:       ; %bb.0:
34476; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34477; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
34478; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
34479; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34480; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34481; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34482; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34483; GFX7-NEXT:    s_setpc_b64 s[30:31]
34484;
34485; GFX8-LABEL: v_select_bf16:
34486; GFX8:       ; %bb.0:
34487; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34488; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
34489; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34490; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34491; GFX8-NEXT:    s_setpc_b64 s[30:31]
34492;
34493; GFX9-LABEL: v_select_bf16:
34494; GFX9:       ; %bb.0:
34495; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34496; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
34497; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34498; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34499; GFX9-NEXT:    s_setpc_b64 s[30:31]
34500;
34501; GFX10-LABEL: v_select_bf16:
34502; GFX10:       ; %bb.0:
34503; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34504; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
34505; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34506; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34507; GFX10-NEXT:    s_setpc_b64 s[30:31]
34508;
34509; GFX11TRUE16-LABEL: v_select_bf16:
34510; GFX11TRUE16:       ; %bb.0:
34511; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34512; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
34513; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
34514; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
34515; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34516; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
34517; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
34518; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
34519;
34520; GFX11FAKE16-LABEL: v_select_bf16:
34521; GFX11FAKE16:       ; %bb.0:
34522; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34523; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
34524; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34525; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34526; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34527; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
34528  %op = select i1 %cond, bfloat %a, bfloat %b
34529  ret bfloat %op
34530}
34531
; Scalar bfloat select whose true operand is negated: returns (fneg %a) when
; %cond is true, otherwise %b.
; The check lines below are autogenerated (see the NOTE at the top of the
; file) and pin how the negation appears for each RUN-line prefix:
;   * GCN and GFX7 multiply v1 by -1.0 and v2 by 1.0 before the v_cndmask,
;     then mask the result with 0xffff0000.
;   * GFX8, GFX9, GFX10 and GFX11FAKE16 xor v1 with 0x8000 ahead of a
;     v_cndmask_b32.
;   * GFX11TRUE16 uses v_xor_b16 with 0x8000 followed by v_cndmask_b16.
34532define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
34533; GCN-LABEL: v_select_fneg_lhs_bf16:
34534; GCN:       ; %bb.0:
34535; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34536; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34537; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
34538; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
34539; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34540; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34541; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34542; GCN-NEXT:    s_setpc_b64 s[30:31]
34543;
34544; GFX7-LABEL: v_select_fneg_lhs_bf16:
34545; GFX7:       ; %bb.0:
34546; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34547; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
34548; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34549; GFX7-NEXT:    v_mul_f32_e32 v1, -1.0, v1
34550; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34551; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34552; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34553; GFX7-NEXT:    s_setpc_b64 s[30:31]
34554;
34555; GFX8-LABEL: v_select_fneg_lhs_bf16:
34556; GFX8:       ; %bb.0:
34557; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34558; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
34559; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
34560; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34561; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34562; GFX8-NEXT:    s_setpc_b64 s[30:31]
34563;
34564; GFX9-LABEL: v_select_fneg_lhs_bf16:
34565; GFX9:       ; %bb.0:
34566; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34567; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
34568; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
34569; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34570; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34571; GFX9-NEXT:    s_setpc_b64 s[30:31]
34572;
34573; GFX10-LABEL: v_select_fneg_lhs_bf16:
34574; GFX10:       ; %bb.0:
34575; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34576; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
34577; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
34578; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34579; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34580; GFX10-NEXT:    s_setpc_b64 s[30:31]
34581;
34582; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
34583; GFX11TRUE16:       ; %bb.0:
34584; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34585; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
34586; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
34587; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
34588; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34589; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
34590; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
34591; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34592; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
34593; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
34594;
34595; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
34596; GFX11FAKE16:       ; %bb.0:
34597; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34598; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
34599; GFX11FAKE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
34600; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34601; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34602; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34603; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
34604  %neg.a = fneg bfloat %a
34605  %op = select i1 %cond, bfloat %neg.a, bfloat %b
34606  ret bfloat %op
34607}
34608
; Scalar bfloat select whose false operand is negated: returns %a when %cond
; is true, otherwise (fneg %b).
; The check lines below are autogenerated (see the NOTE at the top of the
; file) and pin how the negation appears for each RUN-line prefix:
;   * GCN and GFX7 multiply v2 by -1.0 and v1 by 1.0 before the v_cndmask,
;     then mask the result with 0xffff0000.
;   * GFX8, GFX9, GFX10 and GFX11FAKE16 xor v2 with 0x8000 ahead of a
;     v_cndmask_b32.
;   * GFX11TRUE16 uses v_xor_b16 with 0x8000 followed by v_cndmask_b16.
34609define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
34610; GCN-LABEL: v_select_fneg_rhs_bf16:
34611; GCN:       ; %bb.0:
34612; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34613; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
34614; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
34615; GCN-NEXT:    v_mul_f32_e32 v2, -1.0, v2
34616; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34617; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34618; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34619; GCN-NEXT:    s_setpc_b64 s[30:31]
34620;
34621; GFX7-LABEL: v_select_fneg_rhs_bf16:
34622; GFX7:       ; %bb.0:
34623; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34624; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
34625; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
34626; GFX7-NEXT:    v_mul_f32_e32 v2, -1.0, v2
34627; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34628; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34629; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34630; GFX7-NEXT:    s_setpc_b64 s[30:31]
34631;
34632; GFX8-LABEL: v_select_fneg_rhs_bf16:
34633; GFX8:       ; %bb.0:
34634; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34635; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
34636; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
34637; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34638; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34639; GFX8-NEXT:    s_setpc_b64 s[30:31]
34640;
34641; GFX9-LABEL: v_select_fneg_rhs_bf16:
34642; GFX9:       ; %bb.0:
34643; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34644; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
34645; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
34646; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34647; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34648; GFX9-NEXT:    s_setpc_b64 s[30:31]
34649;
34650; GFX10-LABEL: v_select_fneg_rhs_bf16:
34651; GFX10:       ; %bb.0:
34652; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34653; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
34654; GFX10-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
34655; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34656; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34657; GFX10-NEXT:    s_setpc_b64 s[30:31]
34658;
34659; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
34660; GFX11TRUE16:       ; %bb.0:
34661; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34662; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
34663; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
34664; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
34665; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34666; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
34667; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
34668; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34669; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
34670; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
34671;
34672; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
34673; GFX11FAKE16:       ; %bb.0:
34674; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34675; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
34676; GFX11FAKE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
34677; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34678; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34679; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34680; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
34681  %neg.b = fneg bfloat %b
34682  %op = select i1 %cond, bfloat %a, bfloat %neg.b
34683  ret bfloat %op
34684}
34685
; Whole-vector select: a single i1 %cond chooses between the <2 x bfloat>
; values %a and %b (both elements follow the same condition).
; The check lines below are autogenerated (see the NOTE at the top of the
; file). For every prefix they show one v_cmp_eq_u32 feeding two conditional
; moves, one per element:
;   * GCN and GFX7 take the elements in v1..v4, shift each right by 16,
;     select, and shift the two results left by 16 into v0 and v1.
;   * GFX8 rebuilds the packed result with v_lshlrev_b32 + v_or_b32_sdwa.
;   * GFX9, GFX10 and both GFX11 variants rebuild it with v_perm_b32 using
;     selector 0x5040100 (held in s4 on GFX9, an immediate elsewhere).
;   * GFX11FAKE16 issues the two selects as one v_dual_cndmask_b32.
34686define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
34687; GCN-LABEL: v_select_v2bf16:
34688; GCN:       ; %bb.0:
34689; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34690; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
34691; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
34692; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34693; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
34694; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
34695; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
34696; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
34697; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
34698; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
34699; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34700; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
34701; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
34702; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
34703; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
34704; GCN-NEXT:    s_setpc_b64 s[30:31]
34705;
34706; GFX7-LABEL: v_select_v2bf16:
34707; GFX7:       ; %bb.0:
34708; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34709; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
34710; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
34711; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34712; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
34713; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
34714; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
34715; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
34716; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
34717; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
34718; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34719; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
34720; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
34721; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
34722; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
34723; GFX7-NEXT:    s_setpc_b64 s[30:31]
34724;
34725; GFX8-LABEL: v_select_v2bf16:
34726; GFX8:       ; %bb.0:
34727; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34728; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
34729; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34730; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34731; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
34732; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
34733; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
34734; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
34735; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34736; GFX8-NEXT:    s_setpc_b64 s[30:31]
34737;
34738; GFX9-LABEL: v_select_v2bf16:
34739; GFX9:       ; %bb.0:
34740; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34741; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
34742; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34743; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34744; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
34745; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
34746; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
34747; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
34748; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
34749; GFX9-NEXT:    s_setpc_b64 s[30:31]
34750;
34751; GFX10-LABEL: v_select_v2bf16:
34752; GFX10:       ; %bb.0:
34753; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34754; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
34755; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
34756; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
34757; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34758; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34759; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
34760; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
34761; GFX10-NEXT:    s_setpc_b64 s[30:31]
34762;
34763; GFX11TRUE16-LABEL: v_select_v2bf16:
34764; GFX11TRUE16:       ; %bb.0:
34765; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34766; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
34767; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
34768; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
34769; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34770; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34771; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo
34772; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
34773; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34774; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
34775; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
34776; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34777; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
34778; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
34779;
34780; GFX11FAKE16-LABEL: v_select_v2bf16:
34781; GFX11FAKE16:       ; %bb.0:
34782; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34783; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
34784; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
34785; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
34786; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
34787; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34788; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
34789; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34790; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
34791; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
34792  %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
34793  ret <2 x bfloat> %op
34794}
34795
; Element-wise vector select: the <2 x i1> %cond picks each <2 x bfloat>
; element independently from %a or %b.
; The check lines below are autogenerated (see the NOTE at the top of the
; file). For every prefix they show two v_cmp_eq_u32 compares, one on v0 and
; one on v1, each driving its own conditional move:
;   * GCN and GFX7 multiply the four inputs (v2..v5) by 1.0, select, and
;     mask both results with 0xffff0000.
;   * GFX8 rebuilds the packed result with v_lshlrev_b32 + v_or_b32_sdwa.
;   * GFX9, GFX10 and both GFX11 variants rebuild it with v_perm_b32 using
;     selector 0x5040100 (held in s4 on GFX9, an immediate elsewhere).
;   * GFX11TRUE16 writes the second compare result to s0 instead of vcc_lo.
34796define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
34797; GCN-LABEL: v_vselect_v2bf16:
34798; GCN:       ; %bb.0:
34799; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34800; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34801; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
34802; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
34803; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
34804; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
34805; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
34806; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
34807; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
34808; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34809; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
34810; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34811; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
34812; GCN-NEXT:    s_setpc_b64 s[30:31]
34813;
34814; GFX7-LABEL: v_vselect_v2bf16:
34815; GFX7:       ; %bb.0:
34816; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34817; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
34818; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
34819; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
34820; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
34821; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
34822; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
34823; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
34824; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
34825; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34826; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
34827; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
34828; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
34829; GFX7-NEXT:    s_setpc_b64 s[30:31]
34830;
34831; GFX8-LABEL: v_vselect_v2bf16:
34832; GFX8:       ; %bb.0:
34833; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34834; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
34835; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
34836; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34837; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
34838; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
34839; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
34840; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
34841; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
34842; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
34843; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34844; GFX8-NEXT:    s_setpc_b64 s[30:31]
34845;
34846; GFX9-LABEL: v_vselect_v2bf16:
34847; GFX9:       ; %bb.0:
34848; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34849; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
34850; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
34851; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
34852; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
34853; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
34854; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
34855; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
34856; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
34857; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
34858; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
34859; GFX9-NEXT:    s_setpc_b64 s[30:31]
34860;
34861; GFX10-LABEL: v_vselect_v2bf16:
34862; GFX10:       ; %bb.0:
34863; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34864; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
34865; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
34866; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
34867; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
34868; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34869; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
34870; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
34871; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
34872; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
34873; GFX10-NEXT:    s_setpc_b64 s[30:31]
34874;
34875; GFX11TRUE16-LABEL: v_vselect_v2bf16:
34876; GFX11TRUE16:       ; %bb.0:
34877; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34878; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
34879; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
34880; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
34881; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
34882; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34883; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
34884; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
34885; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34886; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo
34887; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, s0
34888; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34889; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
34890; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
34891; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34892; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
34893; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
34894;
34895; GFX11FAKE16-LABEL: v_vselect_v2bf16:
34896; GFX11FAKE16:       ; %bb.0:
34897; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34898; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
34899; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
34900; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
34901; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
34902; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
34903; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
34904; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
34905; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
34906; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
34907; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
34908; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
34909  %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
34910  ret <2 x bfloat> %op
34911}
34912
; amdgpu_ps variant of the scalar bfloat select with both bfloat operands
; marked inreg. The condition is computed in the function as (%c == 0); the
; selected bfloat is bitcast to i16, zero-extended to i32, passed through
; llvm.amdgcn.readfirstlane and returned as an i32.
; The check lines below are autogenerated (see the NOTE at the top of the
; file). For every prefix they show the bfloat operands being read from
; s0/s1, the compare done on v0 against 0, and the result moved to s0 with
; v_readfirstlane_b32:
;   * GCN and GFX7 multiply s0/s1 by 1.0 into VGPRs and shift the selected
;     value right by 16.
;   * GFX8 and later mask the selected value with 0xffff instead.
;   * GFX11TRUE16 performs the select with v_cndmask_b16.
34913define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
34914; GCN-LABEL: s_select_bf16:
34915; GCN:       ; %bb.0:
34916; GCN-NEXT:    v_mul_f32_e64 v1, 1.0, s0
34917; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s1
34918; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
34919; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34920; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
34921; GCN-NEXT:    v_readfirstlane_b32 s0, v0
34922; GCN-NEXT:    ; return to shader part epilog
34923;
34924; GFX7-LABEL: s_select_bf16:
34925; GFX7:       ; %bb.0:
34926; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s0
34927; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s1
34928; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
34929; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
34930; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
34931; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
34932; GFX7-NEXT:    ; return to shader part epilog
34933;
34934; GFX8-LABEL: s_select_bf16:
34935; GFX8:       ; %bb.0:
34936; GFX8-NEXT:    v_mov_b32_e32 v1, s1
34937; GFX8-NEXT:    v_mov_b32_e32 v2, s0
34938; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
34939; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
34940; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
34941; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
34942; GFX8-NEXT:    ; return to shader part epilog
34943;
34944; GFX9-LABEL: s_select_bf16:
34945; GFX9:       ; %bb.0:
34946; GFX9-NEXT:    v_mov_b32_e32 v1, s1
34947; GFX9-NEXT:    v_mov_b32_e32 v2, s0
34948; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
34949; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
34950; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
34951; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
34952; GFX9-NEXT:    ; return to shader part epilog
34953;
34954; GFX10-LABEL: s_select_bf16:
34955; GFX10:       ; %bb.0:
34956; GFX10-NEXT:    v_mov_b32_e32 v1, s0
34957; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
34958; GFX10-NEXT:    v_cndmask_b32_e32 v0, s1, v1, vcc_lo
34959; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
34960; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
34961; GFX10-NEXT:    ; return to shader part epilog
34962;
34963; GFX11TRUE16-LABEL: s_select_bf16:
34964; GFX11TRUE16:       ; %bb.0:
34965; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
34966; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
34967; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
34968; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
34969; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
34970; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34971; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
34972; GFX11TRUE16-NEXT:    ; return to shader part epilog
34973;
34974; GFX11FAKE16-LABEL: s_select_bf16:
34975; GFX11FAKE16:       ; %bb.0:
34976; GFX11FAKE16-NEXT:    v_mov_b32_e32 v1, s0
34977; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
34978; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
34979; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s1, v1, vcc_lo
34980; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
34981; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
34982; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
34983; GFX11FAKE16-NEXT:    ; return to shader part epilog
34984  %cond = icmp eq i32 %c, 0
34985  %op = select i1 %cond, bfloat %a, bfloat %b
34986  %cast = bitcast bfloat %op to i16
34987  %zext = zext i16 %cast to i32
34988  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
34989  ret i32 %readlane
34990}
34991
34992define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
34993; GCN-LABEL: s_select_v2bf16:
34994; GCN:       ; %bb.0:
34995; GCN-NEXT:    v_mul_f32_e64 v1, 1.0, s1
34996; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s3
34997; GCN-NEXT:    v_mul_f32_e64 v3, 1.0, s0
34998; GCN-NEXT:    v_mul_f32_e64 v4, 1.0, s2
34999; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
35000; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35001; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
35002; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35003; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35004; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
35005; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35006; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
35007; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
35008; GCN-NEXT:    v_readfirstlane_b32 s0, v0
35009; GCN-NEXT:    ; return to shader part epilog
35010;
35011; GFX7-LABEL: s_select_v2bf16:
35012; GFX7:       ; %bb.0:
35013; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s1
35014; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s3
35015; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
35016; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35017; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s0
35018; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s2
35019; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35020; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
35021; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35022; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35023; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
35024; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
35025; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
35026; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
35027; GFX7-NEXT:    ; return to shader part epilog
35028;
35029; GFX8-LABEL: s_select_v2bf16:
35030; GFX8:       ; %bb.0:
35031; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
35032; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
35033; GFX8-NEXT:    v_mov_b32_e32 v1, s3
35034; GFX8-NEXT:    v_mov_b32_e32 v2, s2
35035; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35036; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
35037; GFX8-NEXT:    v_mov_b32_e32 v1, s1
35038; GFX8-NEXT:    v_mov_b32_e32 v2, s0
35039; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
35040; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
35041; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
35042; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
35043; GFX8-NEXT:    ; return to shader part epilog
35044;
35045; GFX9-LABEL: s_select_v2bf16:
35046; GFX9:       ; %bb.0:
35047; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
35048; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
35049; GFX9-NEXT:    v_mov_b32_e32 v1, s3
35050; GFX9-NEXT:    v_mov_b32_e32 v2, s2
35051; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35052; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
35053; GFX9-NEXT:    v_mov_b32_e32 v1, s1
35054; GFX9-NEXT:    v_mov_b32_e32 v2, s0
35055; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
35056; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
35057; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s0
35058; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
35059; GFX9-NEXT:    ; return to shader part epilog
35060;
35061; GFX10-LABEL: s_select_v2bf16:
35062; GFX10:       ; %bb.0:
35063; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
35064; GFX10-NEXT:    v_mov_b32_e32 v2, s0
35065; GFX10-NEXT:    v_mov_b32_e32 v1, s2
35066; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
35067; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
35068; GFX10-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
35069; GFX10-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
35070; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
35071; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
35072; GFX10-NEXT:    ; return to shader part epilog
35073;
35074; GFX11TRUE16-LABEL: s_select_v2bf16:
35075; GFX11TRUE16:       ; %bb.0:
35076; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s0, 16
35077; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s1, 16
35078; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
35079; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s3
35080; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, s2
35081; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s1
35082; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, s0
35083; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
35084; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
35085; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
35086; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35087; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
35088; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
35089; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
35090; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
35091; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
35092; GFX11TRUE16-NEXT:    ; return to shader part epilog
35093;
35094; GFX11FAKE16-LABEL: s_select_v2bf16:
35095; GFX11FAKE16:       ; %bb.0:
35096; GFX11FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
35097; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
35098; GFX11FAKE16-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
35099; GFX11FAKE16-NEXT:    s_lshr_b32 s3, s1, 16
35100; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
35101; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
35102; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
35103; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
35104; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
35105; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35106; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
35107; GFX11FAKE16-NEXT:    ; return to shader part epilog
35108  %cond = icmp eq i32 %c, 0
35109  %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
35110  %cast = bitcast <2 x bfloat> %op to i32
35111  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
35112  ret i32 %readlane
35113}
35114
; Per-lane select between two inreg <2 x bfloat> arguments in an amdgpu_ps
; function: each i32 lane of %c is compared against zero, and the matching
; lane of %a (compare true) or %b (compare false) is chosen. The packed result
; is bitcast to i32 and passed through llvm.amdgcn.readfirstlane before being
; returned; every expected output below ends with v_readfirstlane_b32 s0, v0.
; The gfx1100 true16 and fake16 runs have separate check prefixes here.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
35115define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
35116; GCN-LABEL: s_vselect_v2bf16:
35117; GCN:       ; %bb.0:
35118; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s0
35119; GCN-NEXT:    v_mul_f32_e64 v3, 1.0, s2
35120; GCN-NEXT:    v_mul_f32_e64 v4, 1.0, s1
35121; GCN-NEXT:    v_mul_f32_e64 v5, 1.0, s3
35122; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
35123; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
35124; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35125; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
35126; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35127; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
35128; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
35129; GCN-NEXT:    v_readfirstlane_b32 s0, v0
35130; GCN-NEXT:    ; return to shader part epilog
35131;
35132; GFX7-LABEL: s_vselect_v2bf16:
35133; GFX7:       ; %bb.0:
35134; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s1
35135; GFX7-NEXT:    v_mul_f32_e64 v5, 1.0, s3
35136; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
35137; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s0
35138; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s2
35139; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
35140; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35141; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
35142; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35143; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
35144; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
35145; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
35146; GFX7-NEXT:    ; return to shader part epilog
35147;
35148; GFX8-LABEL: s_vselect_v2bf16:
35149; GFX8:       ; %bb.0:
35150; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
35151; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
35152; GFX8-NEXT:    v_mov_b32_e32 v2, s3
35153; GFX8-NEXT:    v_mov_b32_e32 v3, s2
35154; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
35155; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
35156; GFX8-NEXT:    v_mov_b32_e32 v2, s1
35157; GFX8-NEXT:    v_mov_b32_e32 v3, s0
35158; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35159; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
35160; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
35161; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
35162; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
35163; GFX8-NEXT:    ; return to shader part epilog
35164;
35165; GFX9-LABEL: s_vselect_v2bf16:
35166; GFX9:       ; %bb.0:
35167; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
35168; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
35169; GFX9-NEXT:    v_mov_b32_e32 v2, s3
35170; GFX9-NEXT:    v_mov_b32_e32 v3, s2
35171; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
35172; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
35173; GFX9-NEXT:    v_mov_b32_e32 v2, s1
35174; GFX9-NEXT:    v_mov_b32_e32 v3, s0
35175; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
35176; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
35177; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
35178; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s0
35179; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
35180; GFX9-NEXT:    ; return to shader part epilog
35181;
35182; GFX10-LABEL: s_vselect_v2bf16:
35183; GFX10:       ; %bb.0:
35184; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
35185; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
35186; GFX10-NEXT:    v_mov_b32_e32 v2, s2
35187; GFX10-NEXT:    v_mov_b32_e32 v3, s0
35188; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
35189; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
35190; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
35191; GFX10-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
35192; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
35193; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
35194; GFX10-NEXT:    ; return to shader part epilog
35195;
35196; GFX11TRUE16-LABEL: s_vselect_v2bf16:
35197; GFX11TRUE16:       ; %bb.0:
35198; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s1, 16
35199; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s0, 16
35200; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
35201; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
35202; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s3
35203; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, s4
35204; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s1
35205; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, s0
35206; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
35207; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s2
35208; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
35209; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35210; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
35211; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
35212; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
35213; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
35214; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
35215; GFX11TRUE16-NEXT:    ; return to shader part epilog
35216;
35217; GFX11FAKE16-LABEL: s_vselect_v2bf16:
35218; GFX11FAKE16:       ; %bb.0:
35219; GFX11FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
35220; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
35221; GFX11FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
35222; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s1, 16
35223; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
35224; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
35225; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
35226; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
35227; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
35228; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
35229; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35230; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
35231; GFX11FAKE16-NEXT:    ; return to shader part epilog
35232  %cond = icmp eq <2 x i32> %c, zeroinitializer
35233  %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
35234  %cast = bitcast <2 x bfloat> %op to i32
35235  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
35236  ret i32 %readlane
35237}
35238
; Whole-vector select on <3 x bfloat>: a single i1 %cond picks %a (true) or
; %b (false) for all elements at once, and the chosen vector is returned.
; Both gfx1100 runs share the common GFX11 prefix for this function.
; Expected output: GFX8, GFX9 and GFX10 mask %cond to bit 0, compare it with 1
; and issue two v_cndmask_b32; GFX11 uses one v_dual_cndmask_b32 pair. The GCN
; and GFX7 outputs first multiply every input by 1.0 and repack elements with
; shifts / v_alignbit_b32 before the selects.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
35239define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
35240; GCN-LABEL: v_select_v3bf16:
35241; GCN:       ; %bb.0:
35242; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35243; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35244; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35245; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35246; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35247; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35248; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35249; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
35250; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
35251; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35252; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35253; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
35254; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35255; GCN-NEXT:    v_alignbit_b32 v2, v5, v4, 16
35256; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35257; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35258; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
35259; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
35260; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35261; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35262; GCN-NEXT:    s_setpc_b64 s[30:31]
35263;
35264; GFX7-LABEL: v_select_v3bf16:
35265; GFX7:       ; %bb.0:
35266; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35267; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35268; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35269; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35270; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35271; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v5
35272; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35273; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35274; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35275; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35276; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
35277; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
35278; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35279; GFX7-NEXT:    v_alignbit_b32 v2, v2, v4, 16
35280; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35281; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35282; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
35283; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
35284; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35285; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35286; GFX7-NEXT:    s_setpc_b64 s[30:31]
35287;
35288; GFX8-LABEL: v_select_v3bf16:
35289; GFX8:       ; %bb.0:
35290; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35291; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
35292; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35293; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
35294; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
35295; GFX8-NEXT:    s_setpc_b64 s[30:31]
35296;
35297; GFX9-LABEL: v_select_v3bf16:
35298; GFX9:       ; %bb.0:
35299; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35300; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
35301; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35302; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
35303; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
35304; GFX9-NEXT:    s_setpc_b64 s[30:31]
35305;
35306; GFX10-LABEL: v_select_v3bf16:
35307; GFX10:       ; %bb.0:
35308; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35309; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
35310; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35311; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
35312; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
35313; GFX10-NEXT:    s_setpc_b64 s[30:31]
35314;
35315; GFX11-LABEL: v_select_v3bf16:
35316; GFX11:       ; %bb.0:
35317; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35318; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
35319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35320; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35321; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
35322; GFX11-NEXT:    s_setpc_b64 s[30:31]
35323  %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
35324  ret <3 x bfloat> %op
35325}
35326
; Whole-vector select on <4 x bfloat>: a single i1 %cond picks %a (true) or
; %b (false) for all elements at once, and the chosen vector is returned.
; Both gfx1100 runs share the common GFX11 prefix for this function.
; Expected output: GFX8, GFX9 and GFX10 mask %cond to bit 0, compare it with 1
; and issue two v_cndmask_b32; GFX11 uses one v_dual_cndmask_b32 pair. The GCN
; and GFX7 outputs first multiply every input by 1.0 and pack element pairs
; with v_alignbit_b32 before the selects, then split the results again.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
35327define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
35328; GCN-LABEL: v_select_v4bf16:
35329; GCN:       ; %bb.0:
35330; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35331; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35332; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35333; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35334; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35335; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35336; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35337; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
35338; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35339; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
35340; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35341; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35342; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35343; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35344; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35345; GCN-NEXT:    v_alignbit_b32 v2, v6, v5, 16
35346; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35347; GCN-NEXT:    v_alignbit_b32 v4, v8, v7, 16
35348; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35349; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35350; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35351; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35352; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35353; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35354; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35355; GCN-NEXT:    s_setpc_b64 s[30:31]
35356;
35357; GFX7-LABEL: v_select_v4bf16:
35358; GFX7:       ; %bb.0:
35359; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35360; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35361; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35362; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35363; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35364; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35365; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v6
35366; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35367; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35368; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35369; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35370; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35371; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v8
35372; GFX7-NEXT:    v_alignbit_b32 v2, v2, v5, 16
35373; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35374; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v7
35375; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
35376; GFX7-NEXT:    v_alignbit_b32 v4, v4, v5, 16
35377; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35378; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35379; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35380; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35381; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35382; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35383; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35384; GFX7-NEXT:    s_setpc_b64 s[30:31]
35385;
35386; GFX8-LABEL: v_select_v4bf16:
35387; GFX8:       ; %bb.0:
35388; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35389; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
35390; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35391; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
35392; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
35393; GFX8-NEXT:    s_setpc_b64 s[30:31]
35394;
35395; GFX9-LABEL: v_select_v4bf16:
35396; GFX9:       ; %bb.0:
35397; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35398; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
35399; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35400; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
35401; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
35402; GFX9-NEXT:    s_setpc_b64 s[30:31]
35403;
35404; GFX10-LABEL: v_select_v4bf16:
35405; GFX10:       ; %bb.0:
35406; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35407; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
35408; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35409; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
35410; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
35411; GFX10-NEXT:    s_setpc_b64 s[30:31]
35412;
35413; GFX11-LABEL: v_select_v4bf16:
35414; GFX11:       ; %bb.0:
35415; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35416; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
35417; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35418; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35419; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
35420; GFX11-NEXT:    s_setpc_b64 s[30:31]
35421  %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
35422  ret <4 x bfloat> %op
35423}
35424
; Whole-vector select on <6 x bfloat>: a single i1 %cond picks %a (true) or
; %b (false) for all elements at once, and the chosen vector is returned.
; Both gfx1100 runs share the common GFX11 prefix for this function.
; Expected output: GFX8, GFX9 and GFX10 mask %cond to bit 0, compare it with 1
; and issue three v_cndmask_b32; GFX11 uses one v_dual_cndmask_b32 pair plus
; one v_cndmask_b32. The GCN and GFX7 outputs first multiply every input by
; 1.0 and pack element pairs with v_alignbit_b32 before the selects, then
; split the results again.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
35425define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
35426; GCN-LABEL: v_select_v6bf16:
35427; GCN:       ; %bb.0:
35428; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35429; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35430; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35431; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
35432; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35433; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35434; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35435; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
35436; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
35437; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35438; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35439; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
35440; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
35441; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
35442; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35443; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35444; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35445; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
35446; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35447; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
35448; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35449; GCN-NEXT:    v_alignbit_b32 v2, v8, v7, 16
35450; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35451; GCN-NEXT:    v_alignbit_b32 v4, v10, v9, 16
35452; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
35453; GCN-NEXT:    v_alignbit_b32 v6, v12, v11, 16
35454; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35455; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
35456; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35457; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35458; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35459; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35460; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35461; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35462; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
35463; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
35464; GCN-NEXT:    s_setpc_b64 s[30:31]
35465;
35466; GFX7-LABEL: v_select_v6bf16:
35467; GFX7:       ; %bb.0:
35468; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35469; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35470; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35471; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35472; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35473; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35474; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v8
35475; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35476; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35477; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35478; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35479; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35480; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35481; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v10
35482; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35483; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35484; GFX7-NEXT:    v_alignbit_b32 v2, v2, v7, 16
35485; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35486; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v9
35487; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
35488; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v12
35489; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
35490; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35491; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v11
35492; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
35493; GFX7-NEXT:    v_alignbit_b32 v6, v6, v7, 16
35494; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35495; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
35496; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35497; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35498; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35499; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35500; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35501; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35502; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
35503; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
35504; GFX7-NEXT:    s_setpc_b64 s[30:31]
35505;
35506; GFX8-LABEL: v_select_v6bf16:
35507; GFX8:       ; %bb.0:
35508; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35509; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
35510; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35511; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
35512; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
35513; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
35514; GFX8-NEXT:    s_setpc_b64 s[30:31]
35515;
35516; GFX9-LABEL: v_select_v6bf16:
35517; GFX9:       ; %bb.0:
35518; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35519; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
35520; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35521; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
35522; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
35523; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
35524; GFX9-NEXT:    s_setpc_b64 s[30:31]
35525;
35526; GFX10-LABEL: v_select_v6bf16:
35527; GFX10:       ; %bb.0:
35528; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35529; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
35530; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35531; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
35532; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
35533; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
35534; GFX10-NEXT:    s_setpc_b64 s[30:31]
35535;
35536; GFX11-LABEL: v_select_v6bf16:
35537; GFX11:       ; %bb.0:
35538; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35539; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
35540; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35541; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35542; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
35543; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
35544; GFX11-NEXT:    s_setpc_b64 s[30:31]
35545  %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
35546  ret <6 x bfloat> %op
35547}
35548
; Whole-vector select on <8 x bfloat>: a single i1 %cond picks %a (true) or
; %b (false) for all elements at once, and the chosen vector is returned.
; Both gfx1100 runs share the common GFX11 prefix for this function.
; Expected output: GFX8, GFX9 and GFX10 mask %cond to bit 0, compare it with 1
; and issue four v_cndmask_b32; GFX11 uses two v_dual_cndmask_b32 pairs. The
; GCN and GFX7 outputs first multiply every input by 1.0 and pack element
; pairs with v_alignbit_b32 before the selects, then split the results again.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
35549define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
35550; GCN-LABEL: v_select_v8bf16:
35551; GCN:       ; %bb.0:
35552; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35553; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35554; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35555; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
35556; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
35557; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35558; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35559; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
35560; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
35561; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35562; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35563; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
35564; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
35565; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
35566; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35567; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
35568; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
35569; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
35570; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35571; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
35572; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35573; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
35574; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35575; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
35576; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35577; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
35578; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35579; GCN-NEXT:    v_alignbit_b32 v2, v10, v9, 16
35580; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35581; GCN-NEXT:    v_alignbit_b32 v4, v12, v11, 16
35582; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
35583; GCN-NEXT:    v_alignbit_b32 v6, v14, v13, 16
35584; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
35585; GCN-NEXT:    v_alignbit_b32 v8, v16, v15, 16
35586; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35587; GCN-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
35588; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
35589; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35590; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35591; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35592; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35593; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35594; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35595; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
35596; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
35597; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
35598; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
35599; GCN-NEXT:    s_setpc_b64 s[30:31]
35600;
35601; GFX7-LABEL: v_select_v8bf16:
35602; GFX7:       ; %bb.0:
35603; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35604; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35605; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35606; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35607; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35608; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35609; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v10
35610; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35611; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35612; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35613; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35614; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
35615; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35616; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v12
35617; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35618; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35619; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
35620; GFX7-NEXT:    v_alignbit_b32 v2, v2, v9, 16
35621; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35622; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v11
35623; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
35624; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v14
35625; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35626; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35627; GFX7-NEXT:    v_alignbit_b32 v4, v4, v9, 16
35628; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35629; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v13
35630; GFX7-NEXT:    v_alignbit_b32 v7, v8, v7, 16
35631; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v16
35632; GFX7-NEXT:    v_alignbit_b32 v6, v6, v9, 16
35633; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35634; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v15
35635; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
35636; GFX7-NEXT:    v_alignbit_b32 v8, v8, v9, 16
35637; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35638; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
35639; GFX7-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
35640; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35641; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35642; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35643; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35644; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35645; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35646; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
35647; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
35648; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
35649; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
35650; GFX7-NEXT:    s_setpc_b64 s[30:31]
35651;
35652; GFX8-LABEL: v_select_v8bf16:
35653; GFX8:       ; %bb.0:
35654; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35655; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
35656; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35657; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
35658; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
35659; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
35660; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
35661; GFX8-NEXT:    s_setpc_b64 s[30:31]
35662;
35663; GFX9-LABEL: v_select_v8bf16:
35664; GFX9:       ; %bb.0:
35665; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35666; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
35667; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35668; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
35669; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
35670; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
35671; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
35672; GFX9-NEXT:    s_setpc_b64 s[30:31]
35673;
35674; GFX10-LABEL: v_select_v8bf16:
35675; GFX10:       ; %bb.0:
35676; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35677; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
35678; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35679; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
35680; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
35681; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc_lo
35682; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc_lo
35683; GFX10-NEXT:    s_setpc_b64 s[30:31]
35684;
35685; GFX11-LABEL: v_select_v8bf16:
35686; GFX11:       ; %bb.0:
35687; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35688; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
35689; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35690; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35691; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
35692; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
35693; GFX11-NEXT:    s_setpc_b64 s[30:31]
35694  %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
35695  ret <8 x bfloat> %op
35696}
35697
; v_select_v16bf16: returns %a when the i1 %cond is set and %b otherwise, as a
; single whole-vector select on <16 x bfloat>.
;
; What the assertions below expect, per run configuration:
;  * On the GFX8, GFX9 and GFX10 runs the condition in v0 is masked with 1 and
;    compared against 1, then eight v_cndmask_b32 pick between v9-v16 and
;    v1-v8, leaving the results in v0-v7.
;  * On the GFX11 runs the same eight selects appear as four
;    v_dual_cndmask_b32 pairs, preceded by an s_delay_alu.
;  * On the GCN and GFX7 runs each pair of input registers goes through
;    v_mul_f32 by 1.0 and is packed into one dword with v_lshrrev_b32 and
;    v_alignbit_b32; two more dwords are fetched with buffer_load_dword at s32
;    and s32 offset:4 (presumably the stack-passed tail of %b, confirm against
;    the calling convention); the packed dwords are selected with
;    v_cndmask_b32 and each result is split back into two registers with
;    v_lshlrev_b32 by 16 and v_and_b32 with 0xffff0000.
;
; The check lines are autogenerated (see the note at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
35698define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
35699; GCN-LABEL: v_select_v16bf16:
35700; GCN:       ; %bb.0:
35701; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35702; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35703; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35704; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35705; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35706; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v18
35707; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
35708; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35709; GCN-NEXT:    v_alignbit_b32 v2, v2, v17, 16
35710; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35711; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35712; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35713; GCN-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35714; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
35715; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v20
35716; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v19
35717; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35718; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35719; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v22
35720; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v21
35721; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
35722; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35723; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v24
35724; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v23
35725; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
35726; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
35727; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v26
35728; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v25
35729; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
35730; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
35731; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v28
35732; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v27
35733; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
35734; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
35735; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v30
35736; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v29
35737; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
35738; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
35739; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35740; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35741; GCN-NEXT:    v_alignbit_b32 v4, v4, v17, 16
35742; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
35743; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
35744; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32
35745; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
35746; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35747; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
35748; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
35749; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
35750; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
35751; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
35752; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
35753; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
35754; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
35755; GCN-NEXT:    v_alignbit_b32 v18, v18, v19, 16
35756; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
35757; GCN-NEXT:    v_alignbit_b32 v8, v20, v21, 16
35758; GCN-NEXT:    v_alignbit_b32 v9, v10, v9, 16
35759; GCN-NEXT:    v_alignbit_b32 v10, v22, v23, 16
35760; GCN-NEXT:    v_alignbit_b32 v11, v12, v11, 16
35761; GCN-NEXT:    v_alignbit_b32 v12, v24, v25, 16
35762; GCN-NEXT:    v_alignbit_b32 v13, v14, v13, 16
35763; GCN-NEXT:    v_alignbit_b32 v14, v26, v27, 16
35764; GCN-NEXT:    v_alignbit_b32 v15, v16, v15, 16
35765; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35766; GCN-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
35767; GCN-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
35768; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
35769; GCN-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
35770; GCN-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
35771; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35772; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35773; GCN-NEXT:    s_waitcnt vmcnt(1)
35774; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v6
35775; GCN-NEXT:    s_waitcnt vmcnt(0)
35776; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v17
35777; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35778; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35779; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35780; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35781; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
35782; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
35783; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
35784; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
35785; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
35786; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
35787; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
35788; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
35789; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
35790; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
35791; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
35792; GCN-NEXT:    v_alignbit_b32 v14, v14, v16, 16
35793; GCN-NEXT:    v_cndmask_b32_e32 v15, v14, v15, vcc
35794; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
35795; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
35796; GCN-NEXT:    s_setpc_b64 s[30:31]
35797;
35798; GFX7-LABEL: v_select_v16bf16:
35799; GFX7:       ; %bb.0:
35800; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35801; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
35802; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35803; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35804; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
35805; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
35806; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v18
35807; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35808; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
35809; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35810; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
35811; GFX7-NEXT:    v_alignbit_b32 v3, v4, v3, 16
35812; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v20
35813; GFX7-NEXT:    v_alignbit_b32 v2, v2, v17, 16
35814; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35815; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v19
35816; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35817; GFX7-NEXT:    v_alignbit_b32 v4, v4, v17, 16
35818; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35819; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
35820; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
35821; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
35822; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32
35823; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
35824; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v22
35825; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35826; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
35827; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
35828; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
35829; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v21
35830; GFX7-NEXT:    v_alignbit_b32 v7, v8, v7, 16
35831; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v24
35832; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
35833; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
35834; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
35835; GFX7-NEXT:    v_alignbit_b32 v18, v18, v19, 16
35836; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35837; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v23
35838; GFX7-NEXT:    v_alignbit_b32 v9, v10, v9, 16
35839; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v26
35840; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
35841; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
35842; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
35843; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
35844; GFX7-NEXT:    v_alignbit_b32 v8, v8, v19, 16
35845; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
35846; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v25
35847; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
35848; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v28
35849; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
35850; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
35851; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
35852; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
35853; GFX7-NEXT:    v_alignbit_b32 v10, v10, v19, 16
35854; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
35855; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v27
35856; GFX7-NEXT:    v_alignbit_b32 v13, v14, v13, 16
35857; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v30
35858; GFX7-NEXT:    v_alignbit_b32 v15, v16, v15, 16
35859; GFX7-NEXT:    v_alignbit_b32 v12, v12, v19, 16
35860; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
35861; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v29
35862; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
35863; GFX7-NEXT:    v_alignbit_b32 v14, v14, v19, 16
35864; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35865; GFX7-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
35866; GFX7-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
35867; GFX7-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
35868; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
35869; GFX7-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
35870; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
35871; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
35872; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
35873; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
35874; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
35875; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
35876; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
35877; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
35878; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
35879; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
35880; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
35881; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
35882; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
35883; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
35884; GFX7-NEXT:    s_waitcnt vmcnt(1)
35885; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v17
35886; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
35887; GFX7-NEXT:    s_waitcnt vmcnt(0)
35888; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
35889; GFX7-NEXT:    v_alignbit_b32 v6, v16, v6, 16
35890; GFX7-NEXT:    v_cndmask_b32_e32 v15, v6, v15, vcc
35891; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
35892; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
35893; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
35894; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
35895; GFX7-NEXT:    s_setpc_b64 s[30:31]
35896;
35897; GFX8-LABEL: v_select_v16bf16:
35898; GFX8:       ; %bb.0:
35899; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35900; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
35901; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35902; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
35903; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
35904; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
35905; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
35906; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
35907; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
35908; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
35909; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
35910; GFX8-NEXT:    s_setpc_b64 s[30:31]
35911;
35912; GFX9-LABEL: v_select_v16bf16:
35913; GFX9:       ; %bb.0:
35914; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35915; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
35916; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35917; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
35918; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
35919; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
35920; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
35921; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
35922; GFX9-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
35923; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
35924; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
35925; GFX9-NEXT:    s_setpc_b64 s[30:31]
35926;
35927; GFX10-LABEL: v_select_v16bf16:
35928; GFX10:       ; %bb.0:
35929; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35930; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
35931; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35932; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc_lo
35933; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc_lo
35934; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc_lo
35935; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc_lo
35936; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc_lo
35937; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc_lo
35938; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc_lo
35939; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc_lo
35940; GFX10-NEXT:    s_setpc_b64 s[30:31]
35941;
35942; GFX11-LABEL: v_select_v16bf16:
35943; GFX11:       ; %bb.0:
35944; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35945; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
35946; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
35947; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
35948; GFX11-NEXT:    v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
35949; GFX11-NEXT:    v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
35950; GFX11-NEXT:    v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
35951; GFX11-NEXT:    v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
35952; GFX11-NEXT:    s_setpc_b64 s[30:31]
35953  %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
35954  ret <16 x bfloat> %op
35955}
35956
35957define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
35958; GCN-LABEL: v_select_v32bf16:
35959; GCN:       ; %bb.0:
35960; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35961; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
35962; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
35963; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v2
35964; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35965; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
35966; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 16
35967; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v4
35968; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v3
35969; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
35970; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
35971; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v6
35972; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v5
35973; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
35974; GCN-NEXT:    v_alignbit_b32 v2, v2, v3, 16
35975; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v8
35976; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v7
35977; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
35978; GCN-NEXT:    v_alignbit_b32 v3, v3, v4, 16
35979; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v10
35980; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v9
35981; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
35982; GCN-NEXT:    v_alignbit_b32 v4, v4, v5, 16
35983; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v12
35984; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v11
35985; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
35986; GCN-NEXT:    v_alignbit_b32 v5, v5, v6, 16
35987; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v14
35988; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v13
35989; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
35990; GCN-NEXT:    v_alignbit_b32 v6, v6, v7, 16
35991; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v16
35992; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v15
35993; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
35994; GCN-NEXT:    v_alignbit_b32 v7, v7, v8, 16
35995; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v18
35996; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v17
35997; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
35998; GCN-NEXT:    v_alignbit_b32 v8, v8, v9, 16
35999; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v20
36000; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v19
36001; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
36002; GCN-NEXT:    v_alignbit_b32 v9, v9, v10, 16
36003; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:12
36004; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v22
36005; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v21
36006; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
36007; GCN-NEXT:    v_alignbit_b32 v10, v10, v11, 16
36008; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
36009; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v24
36010; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v23
36011; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
36012; GCN-NEXT:    v_alignbit_b32 v11, v11, v12, 16
36013; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:20
36014; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v26
36015; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v25
36016; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
36017; GCN-NEXT:    v_alignbit_b32 v12, v12, v13, 16
36018; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
36019; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v28
36020; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v27
36021; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
36022; GCN-NEXT:    v_alignbit_b32 v13, v13, v14, 16
36023; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
36024; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v30
36025; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v29
36026; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
36027; GCN-NEXT:    v_alignbit_b32 v14, v14, v20, 16
36028; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:24
36029; GCN-NEXT:    s_waitcnt vmcnt(5)
36030; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
36031; GCN-NEXT:    s_waitcnt vmcnt(4)
36032; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
36033; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
36034; GCN-NEXT:    v_alignbit_b32 v15, v15, v16, 16
36035; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
36036; GCN-NEXT:    s_waitcnt vmcnt(4)
36037; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v17
36038; GCN-NEXT:    s_waitcnt vmcnt(3)
36039; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v18
36040; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
36041; GCN-NEXT:    v_alignbit_b32 v16, v16, v17, 16
36042; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:32
36043; GCN-NEXT:    s_waitcnt vmcnt(3)
36044; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v19
36045; GCN-NEXT:    s_waitcnt vmcnt(2)
36046; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v20
36047; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
36048; GCN-NEXT:    v_alignbit_b32 v17, v17, v19, 16
36049; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:44
36050; GCN-NEXT:    s_waitcnt vmcnt(2)
36051; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
36052; GCN-NEXT:    s_waitcnt vmcnt(1)
36053; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
36054; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40
36055; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
36056; GCN-NEXT:    v_alignbit_b32 v18, v20, v18, 16
36057; GCN-NEXT:    s_waitcnt vmcnt(1)
36058; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
36059; GCN-NEXT:    s_waitcnt vmcnt(0)
36060; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
36061; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:52
36062; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:48
36063; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
36064; GCN-NEXT:    v_alignbit_b32 v19, v19, v20, 16
36065; GCN-NEXT:    s_waitcnt vmcnt(1)
36066; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
36067; GCN-NEXT:    s_waitcnt vmcnt(0)
36068; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v22
36069; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
36070; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:56
36071; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
36072; GCN-NEXT:    v_alignbit_b32 v20, v20, v21, 16
36073; GCN-NEXT:    s_waitcnt vmcnt(1)
36074; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v22
36075; GCN-NEXT:    s_waitcnt vmcnt(0)
36076; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v23
36077; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:68
36078; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:64
36079; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
36080; GCN-NEXT:    v_alignbit_b32 v21, v21, v22, 16
36081; GCN-NEXT:    s_waitcnt vmcnt(1)
36082; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v23
36083; GCN-NEXT:    s_waitcnt vmcnt(0)
36084; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
36085; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:76
36086; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
36087; GCN-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
36088; GCN-NEXT:    v_alignbit_b32 v22, v22, v23, 16
36089; GCN-NEXT:    s_waitcnt vmcnt(1)
36090; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v24
36091; GCN-NEXT:    s_waitcnt vmcnt(0)
36092; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v25
36093; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:84
36094; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:80
36095; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
36096; GCN-NEXT:    v_alignbit_b32 v23, v23, v24, 16
36097; GCN-NEXT:    s_waitcnt vmcnt(1)
36098; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v25
36099; GCN-NEXT:    s_waitcnt vmcnt(0)
36100; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v26
36101; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:92
36102; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:88
36103; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
36104; GCN-NEXT:    v_alignbit_b32 v24, v24, v25, 16
36105; GCN-NEXT:    s_waitcnt vmcnt(1)
36106; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v26
36107; GCN-NEXT:    s_waitcnt vmcnt(0)
36108; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
36109; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:100
36110; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:96
36111; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
36112; GCN-NEXT:    v_alignbit_b32 v25, v25, v26, 16
36113; GCN-NEXT:    s_waitcnt vmcnt(1)
36114; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v27
36115; GCN-NEXT:    s_waitcnt vmcnt(0)
36116; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v28
36117; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108
36118; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:104
36119; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
36120; GCN-NEXT:    v_alignbit_b32 v26, v26, v27, 16
36121; GCN-NEXT:    s_waitcnt vmcnt(1)
36122; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v28
36123; GCN-NEXT:    s_waitcnt vmcnt(0)
36124; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v29
36125; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116
36126; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:112
36127; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
36128; GCN-NEXT:    v_alignbit_b32 v27, v27, v28, 16
36129; GCN-NEXT:    s_waitcnt vmcnt(1)
36130; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v29
36131; GCN-NEXT:    s_waitcnt vmcnt(0)
36132; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v30
36133; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
36134; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
36135; GCN-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
36136; GCN-NEXT:    v_alignbit_b32 v28, v28, v29, 16
36137; GCN-NEXT:    s_waitcnt vmcnt(1)
36138; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v30
36139; GCN-NEXT:    s_waitcnt vmcnt(0)
36140; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v31
36141; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
36142; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
36143; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
36144; GCN-NEXT:    v_alignbit_b32 v29, v29, v30, 16
36145; GCN-NEXT:    s_waitcnt vmcnt(1)
36146; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v31
36147; GCN-NEXT:    s_waitcnt vmcnt(0)
36148; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
36149; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
36150; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
36151; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
36152; GCN-NEXT:    v_alignbit_b32 v30, v30, v31, 16
36153; GCN-NEXT:    s_waitcnt vmcnt(1)
36154; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v32
36155; GCN-NEXT:    s_waitcnt vmcnt(0)
36156; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
36157; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
36158; GCN-NEXT:    v_alignbit_b32 v31, v31, v32, 16
36159; GCN-NEXT:    v_cndmask_b32_e32 v31, v31, v30, vcc
36160; GCN-NEXT:    v_cndmask_b32_e32 v29, v29, v14, vcc
36161; GCN-NEXT:    v_cndmask_b32_e32 v28, v28, v13, vcc
36162; GCN-NEXT:    v_cndmask_b32_e32 v27, v27, v12, vcc
36163; GCN-NEXT:    v_cndmask_b32_e32 v26, v26, v11, vcc
36164; GCN-NEXT:    v_cndmask_b32_e32 v25, v25, v10, vcc
36165; GCN-NEXT:    v_cndmask_b32_e32 v24, v24, v9, vcc
36166; GCN-NEXT:    v_cndmask_b32_e32 v23, v23, v8, vcc
36167; GCN-NEXT:    v_cndmask_b32_e32 v22, v22, v7, vcc
36168; GCN-NEXT:    v_cndmask_b32_e32 v13, v21, v6, vcc
36169; GCN-NEXT:    v_cndmask_b32_e32 v11, v20, v5, vcc
36170; GCN-NEXT:    v_cndmask_b32_e32 v9, v19, v4, vcc
36171; GCN-NEXT:    v_cndmask_b32_e32 v7, v18, v3, vcc
36172; GCN-NEXT:    v_cndmask_b32_e32 v5, v17, v2, vcc
36173; GCN-NEXT:    v_cndmask_b32_e32 v3, v16, v1, vcc
36174; GCN-NEXT:    v_cndmask_b32_e32 v1, v15, v0, vcc
36175; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
36176; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
36177; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
36178; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
36179; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
36180; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
36181; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
36182; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
36183; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
36184; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
36185; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
36186; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
36187; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
36188; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
36189; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v22
36190; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v22
36191; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
36192; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
36193; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v24
36194; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v24
36195; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
36196; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v25
36197; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v26
36198; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v26
36199; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
36200; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v27
36201; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
36202; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v28
36203; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
36204; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
36205; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
36206; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
36207; GCN-NEXT:    s_setpc_b64 s[30:31]
36208;
36209; GFX7-LABEL: v_select_v32bf16:
36210; GFX7:       ; %bb.0:
36211; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36212; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
36213; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
36214; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
36215; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
36216; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v4
36217; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
36218; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
36219; GFX7-NEXT:    v_alignbit_b32 v2, v2, v3, 16
36220; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v6
36221; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
36222; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
36223; GFX7-NEXT:    v_alignbit_b32 v3, v3, v4, 16
36224; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v8
36225; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
36226; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v7
36227; GFX7-NEXT:    v_alignbit_b32 v4, v4, v5, 16
36228; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v10
36229; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
36230; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v9
36231; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
36232; GFX7-NEXT:    v_alignbit_b32 v5, v5, v6, 16
36233; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:12
36234; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
36235; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
36236; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
36237; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
36238; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
36239; GFX7-NEXT:    v_alignbit_b32 v17, v18, v17, 16
36240; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
36241; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8
36242; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
36243; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
36244; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
36245; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
36246; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
36247; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
36248; GFX7-NEXT:    v_alignbit_b32 v13, v14, v13, 16
36249; GFX7-NEXT:    v_alignbit_b32 v27, v28, v27, 16
36250; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
36251; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
36252; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
36253; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
36254; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
36255; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
36256; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
36257; GFX7-NEXT:    v_alignbit_b32 v23, v24, v23, 16
36258; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
36259; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
36260; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
36261; GFX7-NEXT:    v_alignbit_b32 v15, v16, v15, 16
36262; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
36263; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
36264; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
36265; GFX7-NEXT:    v_alignbit_b32 v19, v20, v19, 16
36266; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
36267; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
36268; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
36269; GFX7-NEXT:    v_alignbit_b32 v21, v22, v21, 16
36270; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
36271; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
36272; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
36273; GFX7-NEXT:    v_alignbit_b32 v25, v26, v25, 16
36274; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
36275; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
36276; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
36277; GFX7-NEXT:    v_alignbit_b32 v29, v30, v29, 16
36278; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
36279; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
36280; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
36281; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
36282; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
36283; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52
36284; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100
36285; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:68
36286; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
36287; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
36288; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
36289; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
36290; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
36291; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
36292; GFX7-NEXT:    s_waitcnt vmcnt(14)
36293; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
36294; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
36295; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
36296; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
36297; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36298; GFX7-NEXT:    s_waitcnt vmcnt(13)
36299; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
36300; GFX7-NEXT:    s_waitcnt vmcnt(12)
36301; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
36302; GFX7-NEXT:    v_alignbit_b32 v6, v6, v7, 16
36303; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:20
36304; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
36305; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
36306; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
36307; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
36308; GFX7-NEXT:    s_waitcnt vmcnt(12)
36309; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
36310; GFX7-NEXT:    s_waitcnt vmcnt(11)
36311; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
36312; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
36313; GFX7-NEXT:    s_waitcnt vmcnt(9)
36314; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
36315; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
36316; GFX7-NEXT:    s_waitcnt vmcnt(7)
36317; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
36318; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
36319; GFX7-NEXT:    s_waitcnt vmcnt(6)
36320; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
36321; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
36322; GFX7-NEXT:    s_waitcnt vmcnt(5)
36323; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
36324; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
36325; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
36326; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
36327; GFX7-NEXT:    s_waitcnt vmcnt(4)
36328; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
36329; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
36330; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
36331; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
36332; GFX7-NEXT:    s_waitcnt vmcnt(3)
36333; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
36334; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
36335; GFX7-NEXT:    s_waitcnt vmcnt(1)
36336; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
36337; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
36338; GFX7-NEXT:    s_waitcnt vmcnt(0)
36339; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
36340; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
36341; GFX7-NEXT:    v_alignbit_b32 v7, v7, v8, 16
36342; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
36343; GFX7-NEXT:    s_waitcnt vmcnt(0)
36344; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
36345; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
36346; GFX7-NEXT:    v_alignbit_b32 v8, v8, v9, 16
36347; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36
36348; GFX7-NEXT:    s_waitcnt vmcnt(0)
36349; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
36350; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
36351; GFX7-NEXT:    v_alignbit_b32 v9, v9, v10, 16
36352; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
36353; GFX7-NEXT:    v_cndmask_b32_e32 v9, v9, v4, vcc
36354; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
36355; GFX7-NEXT:    s_waitcnt vmcnt(0)
36356; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
36357; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
36358; GFX7-NEXT:    v_alignbit_b32 v10, v10, v31, 16
36359; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
36360; GFX7-NEXT:    v_cndmask_b32_e32 v10, v10, v5, vcc
36361; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v3, vcc
36362; GFX7-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
36363; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
36364; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
36365; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
36366; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
36367; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v9
36368; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
36369; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
36370; GFX7-NEXT:    s_waitcnt vmcnt(0)
36371; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36372; GFX7-NEXT:    v_alignbit_b32 v12, v12, v31, 16
36373; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
36374; GFX7-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
36375; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
36376; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
36377; GFX7-NEXT:    s_waitcnt vmcnt(0)
36378; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36379; GFX7-NEXT:    v_alignbit_b32 v14, v14, v31, 16
36380; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
36381; GFX7-NEXT:    v_cndmask_b32_e32 v13, v14, v13, vcc
36382; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
36383; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
36384; GFX7-NEXT:    s_waitcnt vmcnt(0)
36385; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36386; GFX7-NEXT:    v_alignbit_b32 v16, v16, v31, 16
36387; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
36388; GFX7-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
36389; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
36390; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
36391; GFX7-NEXT:    s_waitcnt vmcnt(0)
36392; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36393; GFX7-NEXT:    v_alignbit_b32 v18, v18, v31, 16
36394; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
36395; GFX7-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
36396; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
36397; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
36398; GFX7-NEXT:    s_waitcnt vmcnt(0)
36399; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36400; GFX7-NEXT:    v_alignbit_b32 v20, v20, v31, 16
36401; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
36402; GFX7-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
36403; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
36404; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
36405; GFX7-NEXT:    s_waitcnt vmcnt(0)
36406; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36407; GFX7-NEXT:    v_alignbit_b32 v22, v22, v31, 16
36408; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
36409; GFX7-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
36410; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
36411; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
36412; GFX7-NEXT:    s_waitcnt vmcnt(0)
36413; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36414; GFX7-NEXT:    v_alignbit_b32 v24, v24, v31, 16
36415; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
36416; GFX7-NEXT:    v_cndmask_b32_e32 v23, v24, v23, vcc
36417; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
36418; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
36419; GFX7-NEXT:    s_waitcnt vmcnt(0)
36420; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36421; GFX7-NEXT:    v_alignbit_b32 v26, v26, v31, 16
36422; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
36423; GFX7-NEXT:    v_cndmask_b32_e32 v25, v26, v25, vcc
36424; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
36425; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
36426; GFX7-NEXT:    s_waitcnt vmcnt(0)
36427; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36428; GFX7-NEXT:    v_alignbit_b32 v28, v28, v31, 16
36429; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
36430; GFX7-NEXT:    v_cndmask_b32_e32 v27, v28, v27, vcc
36431; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
36432; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
36433; GFX7-NEXT:    s_waitcnt vmcnt(0)
36434; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36435; GFX7-NEXT:    v_alignbit_b32 v30, v30, v31, 16
36436; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
36437; GFX7-NEXT:    v_cndmask_b32_e32 v29, v30, v29, vcc
36438; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
36439; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
36440; GFX7-NEXT:    s_waitcnt vmcnt(0)
36441; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
36442; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
36443; GFX7-NEXT:    v_alignbit_b32 v31, v31, v32, 16
36444; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
36445; GFX7-NEXT:    s_waitcnt vmcnt(0)
36446; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
36447; GFX7-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
36448; GFX7-NEXT:    v_alignbit_b32 v32, v32, v33, 16
36449; GFX7-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
36450; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
36451; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
36452; GFX7-NEXT:    s_setpc_b64 s[30:31]
36453;
36454; GFX8-LABEL: v_select_v32bf16:
36455; GFX8:       ; %bb.0:
36456; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36457; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
36458; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
36459; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
36460; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
36461; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
36462; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
36463; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
36464; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
36465; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
36466; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
36467; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
36468; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
36469; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
36470; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
36471; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
36472; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
36473; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
36474; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
36475; GFX8-NEXT:    s_waitcnt vmcnt(1)
36476; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
36477; GFX8-NEXT:    s_waitcnt vmcnt(0)
36478; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
36479; GFX8-NEXT:    s_setpc_b64 s[30:31]
36480;
36481; GFX9-LABEL: v_select_v32bf16:
36482; GFX9:       ; %bb.0:
36483; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36484; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
36485; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
36486; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
36487; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
36488; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
36489; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
36490; GFX9-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
36491; GFX9-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
36492; GFX9-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
36493; GFX9-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
36494; GFX9-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
36495; GFX9-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
36496; GFX9-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
36497; GFX9-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
36498; GFX9-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
36499; GFX9-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
36500; GFX9-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
36501; GFX9-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
36502; GFX9-NEXT:    s_waitcnt vmcnt(1)
36503; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
36504; GFX9-NEXT:    s_waitcnt vmcnt(0)
36505; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
36506; GFX9-NEXT:    s_setpc_b64 s[30:31]
36507;
36508; GFX10-LABEL: v_select_v32bf16:
36509; GFX10:       ; %bb.0:
36510; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36511; GFX10-NEXT:    s_clause 0x1
36512; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
36513; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
36514; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
36515; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
36516; GFX10-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc_lo
36517; GFX10-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc_lo
36518; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc_lo
36519; GFX10-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc_lo
36520; GFX10-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc_lo
36521; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc_lo
36522; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc_lo
36523; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc_lo
36524; GFX10-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc_lo
36525; GFX10-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc_lo
36526; GFX10-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc_lo
36527; GFX10-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc_lo
36528; GFX10-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc_lo
36529; GFX10-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc_lo
36530; GFX10-NEXT:    s_waitcnt vmcnt(1)
36531; GFX10-NEXT:    v_cndmask_b32_e32 v14, v31, v15, vcc_lo
36532; GFX10-NEXT:    s_waitcnt vmcnt(0)
36533; GFX10-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc_lo
36534; GFX10-NEXT:    s_setpc_b64 s[30:31]
36535;
36536; GFX11-LABEL: v_select_v32bf16:
36537; GFX11:       ; %bb.0:
36538; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36539; GFX11-NEXT:    s_clause 0x1
36540; GFX11-NEXT:    scratch_load_b32 v31, off, s32
36541; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
36542; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
36543; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
36544; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
36545; GFX11-NEXT:    v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
36546; GFX11-NEXT:    v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
36547; GFX11-NEXT:    v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
36548; GFX11-NEXT:    v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
36549; GFX11-NEXT:    v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
36550; GFX11-NEXT:    v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
36551; GFX11-NEXT:    v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
36552; GFX11-NEXT:    s_waitcnt vmcnt(0)
36553; GFX11-NEXT:    v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
36554; GFX11-NEXT:    s_setpc_b64 s[30:31]
36555  %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
36556  ret <32 x bfloat> %op
36557}
36558
; Select between two <3 x bfloat> vectors passed as inreg arguments to an
; amdgpu_ps function: %a is chosen when %c == 0, otherwise %b. The 48-bit
; result is split into its low 32 bits and its upper 16 bits, and each part is
; passed through llvm.amdgcn.readfirstlane before being packed into the
; <2 x i32> return value.
;
; In the expected output, the default-CPU and hawaii runs multiply each of
; s0-s5 by 1.0 and pack the first two elements of each vector with
; v_alignbit_b32 before selecting; the tonga and newer runs select between the
; SGPR dwords directly and mask the second result with 0xffff.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
36559define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
36560; GCN-LABEL: s_select_v3bf16:
36561; GCN:       ; %bb.0:
36562; GCN-NEXT:    v_mul_f32_e64 v1, 1.0, s1
36563; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s0
36564; GCN-NEXT:    v_mul_f32_e64 v3, 1.0, s4
36565; GCN-NEXT:    v_mul_f32_e64 v4, 1.0, s3
36566; GCN-NEXT:    v_mul_f32_e64 v5, 1.0, s2
36567; GCN-NEXT:    v_mul_f32_e64 v6, 1.0, s5
36568; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
36569; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
36570; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
36571; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
36572; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
36573; GCN-NEXT:    v_alignbit_b32 v2, v3, v4, 16
36574; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36575; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
36576; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
36577; GCN-NEXT:    v_readfirstlane_b32 s0, v1
36578; GCN-NEXT:    v_readfirstlane_b32 s1, v0
36579; GCN-NEXT:    ; return to shader part epilog
36580;
36581; GFX7-LABEL: s_select_v3bf16:
36582; GFX7:       ; %bb.0:
36583; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s1
36584; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
36585; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s0
36586; GFX7-NEXT:    v_alignbit_b32 v1, v1, v2, 16
36587; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s4
36588; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
36589; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s3
36590; GFX7-NEXT:    v_alignbit_b32 v2, v2, v3, 16
36591; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s2
36592; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s5
36593; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
36594; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
36595; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36596; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
36597; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
36598; GFX7-NEXT:    v_readfirstlane_b32 s0, v1
36599; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
36600; GFX7-NEXT:    ; return to shader part epilog
36601;
36602; GFX8-LABEL: s_select_v3bf16:
36603; GFX8:       ; %bb.0:
36604; GFX8-NEXT:    v_mov_b32_e32 v1, s2
36605; GFX8-NEXT:    v_mov_b32_e32 v2, s0
36606; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36607; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
36608; GFX8-NEXT:    v_mov_b32_e32 v1, s3
36609; GFX8-NEXT:    v_mov_b32_e32 v2, s1
36610; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
36611; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
36612; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
36613; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
36614; GFX8-NEXT:    ; return to shader part epilog
36615;
36616; GFX9-LABEL: s_select_v3bf16:
36617; GFX9:       ; %bb.0:
36618; GFX9-NEXT:    v_mov_b32_e32 v1, s2
36619; GFX9-NEXT:    v_mov_b32_e32 v2, s0
36620; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36621; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
36622; GFX9-NEXT:    v_mov_b32_e32 v1, s3
36623; GFX9-NEXT:    v_mov_b32_e32 v2, s1
36624; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
36625; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
36626; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
36627; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
36628; GFX9-NEXT:    ; return to shader part epilog
36629;
36630; GFX10-LABEL: s_select_v3bf16:
36631; GFX10:       ; %bb.0:
36632; GFX10-NEXT:    v_mov_b32_e32 v1, s0
36633; GFX10-NEXT:    v_mov_b32_e32 v2, s1
36634; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36635; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v1, vcc_lo
36636; GFX10-NEXT:    v_cndmask_b32_e32 v1, s3, v2, vcc_lo
36637; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
36638; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
36639; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
36640; GFX10-NEXT:    ; return to shader part epilog
36641;
36642; GFX11-LABEL: s_select_v3bf16:
36643; GFX11:       ; %bb.0:
36644; GFX11-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
36645; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36646; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
36647; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v1, vcc_lo
36648; GFX11-NEXT:    v_cndmask_b32_e32 v1, s3, v2, vcc_lo
36649; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36650; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
36651; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
36652; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
36653; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
36654; GFX11-NEXT:    ; return to shader part epilog
36655  %cond = icmp eq i32 %c, 0
36656  %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
  ; View the three selected bfloats as a single 48-bit integer, then peel off
  ; the low 32 bits (%elt0) and the remaining upper 16 bits (%elt1).
36657  %cast = bitcast <3 x bfloat> %op to i48
36658  %elt0 = trunc i48 %cast to i32
36659  %elt1.hi = lshr i48 %cast, 32
36660  %elt1 = trunc i48 %elt1.hi to i32
36661  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36662  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
  ; Assemble the two readfirstlane results into the <2 x i32> return value.
36663  %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36664  %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36665  ret <2 x i32> %bv.1
36666}
36667
; Select between two <4 x bfloat> vectors passed as inreg arguments to an
; amdgpu_ps function: %a is chosen when %c == 0, otherwise %b. The selected
; vector is reinterpreted as <2 x i32>, and each 32-bit word is passed through
; llvm.amdgcn.readfirstlane before being packed into the <2 x i32> return
; value.
;
; In the expected output, the default-CPU and hawaii runs multiply each of
; s0-s7 by 1.0 and pack element pairs with v_alignbit_b32 before the two
; v_cndmask_b32 selects; the tonga and newer runs select between the SGPR
; dwords (s0/s2 and s1/s3) directly.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
36668define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
36669; GCN-LABEL: s_select_v4bf16:
36670; GCN:       ; %bb.0:
36671; GCN-NEXT:    v_mul_f32_e64 v1, 1.0, s1
36672; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s0
36673; GCN-NEXT:    v_mul_f32_e64 v3, 1.0, s5
36674; GCN-NEXT:    v_mul_f32_e64 v4, 1.0, s4
36675; GCN-NEXT:    v_mul_f32_e64 v5, 1.0, s3
36676; GCN-NEXT:    v_mul_f32_e64 v6, 1.0, s2
36677; GCN-NEXT:    v_mul_f32_e64 v7, 1.0, s7
36678; GCN-NEXT:    v_mul_f32_e64 v8, 1.0, s6
36679; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
36680; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
36681; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
36682; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
36683; GCN-NEXT:    v_alignbit_b32 v1, v1, v2, 16
36684; GCN-NEXT:    v_alignbit_b32 v2, v3, v4, 16
36685; GCN-NEXT:    v_alignbit_b32 v3, v5, v6, 16
36686; GCN-NEXT:    v_alignbit_b32 v4, v7, v8, 16
36687; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36688; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
36689; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
36690; GCN-NEXT:    v_readfirstlane_b32 s0, v1
36691; GCN-NEXT:    v_readfirstlane_b32 s1, v0
36692; GCN-NEXT:    ; return to shader part epilog
36693;
36694; GFX7-LABEL: s_select_v4bf16:
36695; GFX7:       ; %bb.0:
36696; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s1
36697; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
36698; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s0
36699; GFX7-NEXT:    v_alignbit_b32 v1, v1, v2, 16
36700; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s5
36701; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
36702; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s4
36703; GFX7-NEXT:    v_alignbit_b32 v2, v2, v3, 16
36704; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s3
36705; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
36706; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s2
36707; GFX7-NEXT:    v_alignbit_b32 v3, v3, v4, 16
36708; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s7
36709; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
36710; GFX7-NEXT:    v_mul_f32_e64 v5, 1.0, s6
36711; GFX7-NEXT:    v_alignbit_b32 v4, v4, v5, 16
36712; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36713; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
36714; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
36715; GFX7-NEXT:    v_readfirstlane_b32 s0, v1
36716; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
36717; GFX7-NEXT:    ; return to shader part epilog
36718;
36719; GFX8-LABEL: s_select_v4bf16:
36720; GFX8:       ; %bb.0:
36721; GFX8-NEXT:    v_mov_b32_e32 v1, s3
36722; GFX8-NEXT:    v_mov_b32_e32 v2, s1
36723; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36724; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
36725; GFX8-NEXT:    v_mov_b32_e32 v1, s2
36726; GFX8-NEXT:    v_mov_b32_e32 v2, s0
36727; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
36728; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
36729; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
36730; GFX8-NEXT:    ; return to shader part epilog
36731;
36732; GFX9-LABEL: s_select_v4bf16:
36733; GFX9:       ; %bb.0:
36734; GFX9-NEXT:    v_mov_b32_e32 v1, s3
36735; GFX9-NEXT:    v_mov_b32_e32 v2, s1
36736; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36737; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
36738; GFX9-NEXT:    v_mov_b32_e32 v1, s2
36739; GFX9-NEXT:    v_mov_b32_e32 v2, s0
36740; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
36741; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
36742; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
36743; GFX9-NEXT:    ; return to shader part epilog
36744;
36745; GFX10-LABEL: s_select_v4bf16:
36746; GFX10:       ; %bb.0:
36747; GFX10-NEXT:    v_mov_b32_e32 v1, s1
36748; GFX10-NEXT:    v_mov_b32_e32 v2, s0
36749; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36750; GFX10-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
36751; GFX10-NEXT:    v_cndmask_b32_e32 v1, s2, v2, vcc_lo
36752; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
36753; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
36754; GFX10-NEXT:    ; return to shader part epilog
36755;
36756; GFX11-LABEL: s_select_v4bf16:
36757; GFX11:       ; %bb.0:
36758; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
36759; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36760; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
36761; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
36762; GFX11-NEXT:    v_cndmask_b32_e32 v1, s2, v2, vcc_lo
36763; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36764; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
36765; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
36766; GFX11-NEXT:    ; return to shader part epilog
36767  %cond = icmp eq i32 %c, 0
36768  %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
  ; View the four selected bfloats as two 32-bit words and extract each one.
36769  %cast = bitcast <4 x bfloat> %op to <2 x i32>
36770  %elt0 = extractelement <2 x i32> %cast, i32 0
36771  %elt1 = extractelement <2 x i32> %cast, i32 1
36772  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36773  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
  ; Assemble the two readfirstlane results into the <2 x i32> return value.
36774  %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36775  %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36776  ret <2 x i32> %bv.1
36777}
36778
; Element-wise select between two <4 x bfloat> vectors passed as inreg
; arguments to an amdgpu_ps function. Each lane i takes %a[i] when %c[i] == 0
; and %b[i] otherwise, so four independent conditions are evaluated. The
; selected vector is reinterpreted as <2 x i32>, and each 32-bit word is passed
; through llvm.amdgcn.readfirstlane before being packed into the <2 x i32>
; return value.
;
; In the expected output:
;  - the default-CPU and hawaii runs multiply s0-s7 by 1.0, perform four
;    v_cmp/v_cndmask pairs, and repack the halves with and/shift/or;
;  - the tonga run extracts the high halves with s_lshr_b32 and repacks with
;    v_or_b32_sdwa;
;  - the gfx900, gfx1010 and gfx1100 (-real-true16) runs repack with
;    v_perm_b32 using the selector 0x5040100;
;  - the gfx1100 (+real-true16) run selects on 16-bit register halves with
;    v_cndmask_b16 before the v_perm_b32 repack.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
36779define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
36780; GCN-LABEL: s_vselect_v4bf16:
36781; GCN:       ; %bb.0:
36782; GCN-NEXT:    v_mul_f32_e64 v4, 1.0, s0
36783; GCN-NEXT:    v_mul_f32_e64 v5, 1.0, s4
36784; GCN-NEXT:    v_mul_f32_e64 v6, 1.0, s1
36785; GCN-NEXT:    v_mul_f32_e64 v7, 1.0, s5
36786; GCN-NEXT:    v_mul_f32_e64 v8, 1.0, s2
36787; GCN-NEXT:    v_mul_f32_e64 v9, 1.0, s6
36788; GCN-NEXT:    v_mul_f32_e64 v10, 1.0, s3
36789; GCN-NEXT:    v_mul_f32_e64 v11, 1.0, s7
36790; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
36791; GCN-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc
36792; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
36793; GCN-NEXT:    v_cndmask_b32_e32 v2, v9, v8, vcc
36794; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
36795; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
36796; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36797; GCN-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
36798; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
36799; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
36800; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
36801; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
36802; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
36803; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
36804; GCN-NEXT:    v_readfirstlane_b32 s0, v0
36805; GCN-NEXT:    v_readfirstlane_b32 s1, v2
36806; GCN-NEXT:    ; return to shader part epilog
36807;
36808; GFX7-LABEL: s_vselect_v4bf16:
36809; GFX7:       ; %bb.0:
36810; GFX7-NEXT:    v_mul_f32_e64 v10, 1.0, s3
36811; GFX7-NEXT:    v_mul_f32_e64 v11, 1.0, s7
36812; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
36813; GFX7-NEXT:    v_mul_f32_e64 v8, 1.0, s2
36814; GFX7-NEXT:    v_mul_f32_e64 v9, 1.0, s6
36815; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc
36816; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
36817; GFX7-NEXT:    v_mul_f32_e64 v6, 1.0, s1
36818; GFX7-NEXT:    v_mul_f32_e64 v7, 1.0, s5
36819; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v8, vcc
36820; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
36821; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s0
36822; GFX7-NEXT:    v_mul_f32_e64 v5, 1.0, s4
36823; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
36824; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36825; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
36826; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
36827; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
36828; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
36829; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
36830; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
36831; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
36832; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
36833; GFX7-NEXT:    v_readfirstlane_b32 s1, v2
36834; GFX7-NEXT:    ; return to shader part epilog
36835;
36836; GFX8-LABEL: s_vselect_v4bf16:
36837; GFX8:       ; %bb.0:
36838; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
36839; GFX8-NEXT:    s_lshr_b32 s5, s3, 16
36840; GFX8-NEXT:    v_mov_b32_e32 v4, s5
36841; GFX8-NEXT:    v_mov_b32_e32 v5, s4
36842; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
36843; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
36844; GFX8-NEXT:    v_mov_b32_e32 v4, s3
36845; GFX8-NEXT:    v_mov_b32_e32 v5, s1
36846; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
36847; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
36848; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
36849; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
36850; GFX8-NEXT:    s_lshr_b32 s3, s2, 16
36851; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36852; GFX8-NEXT:    v_mov_b32_e32 v3, s3
36853; GFX8-NEXT:    v_mov_b32_e32 v4, s1
36854; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
36855; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
36856; GFX8-NEXT:    v_mov_b32_e32 v3, s2
36857; GFX8-NEXT:    v_mov_b32_e32 v4, s0
36858; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36859; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
36860; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
36861; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36862; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
36863; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
36864; GFX8-NEXT:    ; return to shader part epilog
36865;
36866; GFX9-LABEL: s_vselect_v4bf16:
36867; GFX9:       ; %bb.0:
36868; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
36869; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
36870; GFX9-NEXT:    v_mov_b32_e32 v4, s5
36871; GFX9-NEXT:    v_mov_b32_e32 v5, s4
36872; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
36873; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
36874; GFX9-NEXT:    v_mov_b32_e32 v4, s3
36875; GFX9-NEXT:    v_mov_b32_e32 v5, s1
36876; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
36877; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
36878; GFX9-NEXT:    s_mov_b32 s1, 0x5040100
36879; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
36880; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
36881; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s1
36882; GFX9-NEXT:    v_mov_b32_e32 v3, s4
36883; GFX9-NEXT:    v_mov_b32_e32 v4, s3
36884; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
36885; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
36886; GFX9-NEXT:    v_mov_b32_e32 v3, s2
36887; GFX9-NEXT:    v_mov_b32_e32 v4, s0
36888; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
36889; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
36890; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s1
36891; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
36892; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
36893; GFX9-NEXT:    ; return to shader part epilog
36894;
36895; GFX10-LABEL: s_vselect_v4bf16:
36896; GFX10:       ; %bb.0:
36897; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
36898; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
36899; GFX10-NEXT:    v_mov_b32_e32 v4, s4
36900; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
36901; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
36902; GFX10-NEXT:    v_mov_b32_e32 v6, s0
36903; GFX10-NEXT:    s_lshr_b32 s0, s2, 16
36904; GFX10-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
36905; GFX10-NEXT:    v_mov_b32_e32 v4, s5
36906; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
36907; GFX10-NEXT:    v_mov_b32_e32 v5, s1
36908; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
36909; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36910; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
36911; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
36912; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
36913; GFX10-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
36914; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
36915; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
36916; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
36917; GFX10-NEXT:    ; return to shader part epilog
36918;
36919; GFX11TRUE16-LABEL: s_vselect_v4bf16:
36920; GFX11TRUE16:       ; %bb.0:
36921; GFX11TRUE16-NEXT:    s_lshr_b32 s7, s3, 16
36922; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36923; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
36924; GFX11TRUE16-NEXT:    s_lshr_b32 s8, s1, 16
36925; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
36926; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s3
36927; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
36928; GFX11TRUE16-NEXT:    s_lshr_b32 s7, s0, 16
36929; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 0, v2
36930; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 0, v3
36931; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, s8
36932; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, s3
36933; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s7
36934; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, s2
36935; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s0
36936; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.h, s1
36937; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s6
36938; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v2.l, s4
36939; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
36940; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo
36941; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.h, s5
36942; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
36943; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
36944; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
36945; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
36946; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.h
36947; GFX11TRUE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
36948; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36949; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
36950; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v1
36951; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
36952; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
36953; GFX11TRUE16-NEXT:    ; return to shader part epilog
36954;
36955; GFX11FAKE16-LABEL: s_vselect_v4bf16:
36956; GFX11FAKE16:       ; %bb.0:
36957; GFX11FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
36958; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
36959; GFX11FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
36960; GFX11FAKE16-NEXT:    s_lshr_b32 s4, s3, 16
36961; GFX11FAKE16-NEXT:    s_lshr_b32 s5, s0, 16
36962; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
36963; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
36964; GFX11FAKE16-NEXT:    v_mov_b32_e32 v4, s5
36965; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
36966; GFX11FAKE16-NEXT:    v_mov_b32_e32 v6, s0
36967; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s2, 16
36968; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
36969; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
36970; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
36971; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
36972; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
36973; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
36974; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
36975; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
36976; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36977; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
36978; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
36979; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
36980; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s1, v1
36981; GFX11FAKE16-NEXT:    ; return to shader part epilog
  ; One i1 condition per lane: true where the corresponding element of %c is 0.
36982  %cond = icmp eq <4 x i32> %c, zeroinitializer
36983  %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
  ; View the four selected bfloats as two 32-bit words and extract each one.
36984  %cast = bitcast <4 x bfloat> %op to <2 x i32>
36985  %elt0 = extractelement <2 x i32> %cast, i32 0
36986  %elt1 = extractelement <2 x i32> %cast, i32 1
36987  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36988  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
  ; Assemble the two readfirstlane results into the <2 x i32> return value.
36989  %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36990  %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36991  ret <2 x i32> %bv.1
36992}
36993
; Test v_vselect_v4bf16 - element-wise select between two <4 x bfloat> vectors.
;
; The IR body is a single `select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b`
; whose result is returned directly. The assertion groups below are the llc output
; recorded by utils/update_llc_test_checks.py, one group per check prefix used on
; the run lines at the top of the file. Regenerate them with that script rather
; than editing them by hand.
;
; What the recorded output shows:
;   - For GCN and GFX7, each condition value is masked with 1 and compared against
;     1, every bfloat operand goes through v_mul_f32 by 1.0, one v_cndmask_b32 is
;     issued per element, and each result is masked with 0xffff0000.
;   - For GFX8, the odd elements are extracted with a 16-bit right shift and the
;     selected halves are recombined with v_lshlrev_b32 plus v_or_b32_sdwa.
;   - For GFX9, GFX10 and the GFX11 fake16 configuration, the selected halves are
;     recombined with v_perm_b32 using selector 0x5040100.
;   - For the GFX11 true16 configuration, the selects are v_cndmask_b16 on
;     16-bit register halves, followed by v_mov_b16 and v_perm_b32.
36994define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
36995; GCN-LABEL: v_vselect_v4bf16:
36996; GCN:       ; %bb.0:
36997; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36998; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
36999; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37000; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
37001; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
37002; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
37003; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
37004; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
37005; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
37006; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
37007; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37008; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
37009; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
37010; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37011; GCN-NEXT:    v_cndmask_b32_e32 v3, v11, v7, vcc
37012; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37013; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
37014; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37015; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v5, vcc
37016; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37017; GCN-NEXT:    v_cndmask_b32_e32 v0, v8, v4, vcc
37018; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
37019; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
37020; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
37021; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
37022; GCN-NEXT:    s_setpc_b64 s[30:31]
37023;
37024; GFX7-LABEL: v_vselect_v4bf16:
37025; GFX7:       ; %bb.0:
37026; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37027; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
37028; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
37029; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37030; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
37031; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37032; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
37033; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
37034; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
37035; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v7, vcc
37036; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37037; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
37038; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
37039; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
37040; GFX7-NEXT:    v_cndmask_b32_e32 v2, v10, v6, vcc
37041; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37042; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
37043; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37044; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v5, vcc
37045; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37046; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v4, vcc
37047; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
37048; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
37049; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
37050; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
37051; GFX7-NEXT:    s_setpc_b64 s[30:31]
37052;
37053; GFX8-LABEL: v_vselect_v4bf16:
37054; GFX8:       ; %bb.0:
37055; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37056; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
37057; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
37058; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
37059; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
37060; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37061; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
37062; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
37063; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37064; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
37065; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
37066; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
37067; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
37068; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37069; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
37070; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37071; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
37072; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
37073; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37074; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
37075; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37076; GFX8-NEXT:    s_setpc_b64 s[30:31]
37077;
37078; GFX9-LABEL: v_vselect_v4bf16:
37079; GFX9:       ; %bb.0:
37080; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37081; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
37082; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
37083; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37084; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
37085; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
37086; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
37087; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
37088; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37089; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
37090; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
37091; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37092; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
37093; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
37094; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
37095; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37096; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
37097; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
37098; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
37099; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
37100; GFX9-NEXT:    s_setpc_b64 s[30:31]
37101;
37102; GFX10-LABEL: v_vselect_v4bf16:
37103; GFX10:       ; %bb.0:
37104; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37105; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
37106; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
37107; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
37108; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
37109; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
37110; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
37111; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
37112; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
37113; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
37114; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
37115; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
37116; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
37117; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
37118; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
37119; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
37120; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
37121; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37122; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
37123; GFX10-NEXT:    s_setpc_b64 s[30:31]
37124;
37125; GFX11TRUE16-LABEL: v_vselect_v4bf16:
37126; GFX11TRUE16:       ; %bb.0:
37127; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37128; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
37129; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
37130; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
37131; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
37132; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
37133; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
37134; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v2
37135; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
37136; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
37137; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
37138; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
37139; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
37140; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
37141; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
37142; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
37143; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
37144; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v3.l, s1
37145; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
37146; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v5.l, s2
37147; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
37148; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
37149; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
37150; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
37151; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
37152; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37153; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
37154; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
37155; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
37156;
37157; GFX11FAKE16-LABEL: v_vselect_v4bf16:
37158; GFX11FAKE16:       ; %bb.0:
37159; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37160; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
37161; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
37162; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
37163; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
37164; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
37165; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
37166; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
37167; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
37168; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
37169; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
37170; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
37171; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37172; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
37173; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
37174; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
37175; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
37176; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37177; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
37178; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
37179; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
37180  %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
37181  ret <4 x bfloat> %op
37182}
37183
; Test v_vselect_v8bf16 - element-wise select between two <8 x bfloat> vectors.
;
; The IR body is a single `select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b`
; whose result is returned directly. The assertion groups below are the llc output
; recorded by utils/update_llc_test_checks.py, one group per check prefix used on
; the run lines at the top of the file. Regenerate them with that script rather
; than editing them by hand.
;
; What the recorded output shows:
;   - For GCN and GFX7, each of the eight condition values is masked with 1 and
;     compared against 1, every bfloat operand goes through v_mul_f32 by 1.0, one
;     v_cndmask_b32 is issued per element, and each result is masked with
;     0xffff0000.
;   - For GFX8, the odd elements are extracted with a 16-bit right shift and the
;     selected halves are recombined with v_lshlrev_b32 plus v_or_b32_sdwa.
;   - For GFX9, GFX10 and the GFX11 fake16 configuration, the selected halves are
;     recombined with v_perm_b32 using selector 0x5040100.
;   - For the GFX11 true16 configuration, the compare results are held in vcc_lo
;     and s0 through s6, the selects are v_cndmask_b16 on 16-bit register halves,
;     and the result is assembled with v_mov_b16 and v_perm_b32.
37184define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
37185; GCN-LABEL: v_vselect_v8bf16:
37186; GCN:       ; %bb.0:
37187; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37188; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
37189; GCN-NEXT:    v_and_b32_e32 v6, 1, v6
37190; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
37191; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
37192; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
37193; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
37194; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
37195; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
37196; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
37197; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
37198; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
37199; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
37200; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
37201; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
37202; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37203; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
37204; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
37205; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
37206; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
37207; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
37208; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
37209; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
37210; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
37211; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
37212; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
37213; GCN-NEXT:    v_cndmask_b32_e32 v7, v23, v15, vcc
37214; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
37215; GCN-NEXT:    v_cndmask_b32_e32 v6, v22, v14, vcc
37216; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
37217; GCN-NEXT:    v_cndmask_b32_e32 v5, v21, v13, vcc
37218; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
37219; GCN-NEXT:    v_cndmask_b32_e32 v4, v20, v12, vcc
37220; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37221; GCN-NEXT:    v_cndmask_b32_e32 v3, v19, v11, vcc
37222; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37223; GCN-NEXT:    v_cndmask_b32_e32 v2, v18, v10, vcc
37224; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37225; GCN-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc
37226; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37227; GCN-NEXT:    v_cndmask_b32_e32 v0, v16, v8, vcc
37228; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
37229; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
37230; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
37231; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
37232; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
37233; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
37234; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
37235; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
37236; GCN-NEXT:    s_setpc_b64 s[30:31]
37237;
37238; GFX7-LABEL: v_vselect_v8bf16:
37239; GFX7:       ; %bb.0:
37240; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37241; GFX7-NEXT:    v_and_b32_e32 v7, 1, v7
37242; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
37243; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
37244; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
37245; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
37246; GFX7-NEXT:    v_cndmask_b32_e32 v7, v23, v15, vcc
37247; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
37248; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v22
37249; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
37250; GFX7-NEXT:    v_and_b32_e32 v5, 1, v5
37251; GFX7-NEXT:    v_cndmask_b32_e32 v6, v15, v14, vcc
37252; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
37253; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v21
37254; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
37255; GFX7-NEXT:    v_and_b32_e32 v4, 1, v4
37256; GFX7-NEXT:    v_cndmask_b32_e32 v5, v14, v13, vcc
37257; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
37258; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v20
37259; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
37260; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
37261; GFX7-NEXT:    v_cndmask_b32_e32 v4, v13, v12, vcc
37262; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
37263; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v19
37264; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37265; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
37266; GFX7-NEXT:    v_cndmask_b32_e32 v3, v12, v11, vcc
37267; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
37268; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
37269; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v18
37270; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37271; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
37272; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
37273; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v17
37274; GFX7-NEXT:    v_cndmask_b32_e32 v2, v13, v10, vcc
37275; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37276; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37277; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v16
37278; GFX7-NEXT:    v_cndmask_b32_e32 v1, v12, v9, vcc
37279; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37280; GFX7-NEXT:    v_cndmask_b32_e32 v0, v11, v8, vcc
37281; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
37282; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
37283; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
37284; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
37285; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
37286; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
37287; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
37288; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
37289; GFX7-NEXT:    s_setpc_b64 s[30:31]
37290;
37291; GFX8-LABEL: v_vselect_v8bf16:
37292; GFX8:       ; %bb.0:
37293; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37294; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
37295; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
37296; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v11
37297; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
37298; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
37299; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
37300; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
37301; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
37302; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
37303; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc
37304; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
37305; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
37306; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
37307; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
37308; GFX8-NEXT:    v_cndmask_b32_e32 v5, v15, v11, vcc
37309; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
37310; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
37311; GFX8-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc
37312; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
37313; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
37314; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37315; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
37316; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc
37317; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37318; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
37319; GFX8-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc
37320; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
37321; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
37322; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37323; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc
37324; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37325; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc
37326; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
37327; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37328; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
37329; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37330; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
37331; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
37332; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37333; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37334; GFX8-NEXT:    s_setpc_b64 s[30:31]
37335;
37336; GFX9-LABEL: v_vselect_v8bf16:
37337; GFX9:       ; %bb.0:
37338; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37339; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
37340; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
37341; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
37342; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
37343; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc
37344; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
37345; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
37346; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
37347; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
37348; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc
37349; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
37350; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
37351; GFX9-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc
37352; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
37353; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
37354; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
37355; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
37356; GFX9-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc
37357; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37358; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
37359; GFX9-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc
37360; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
37361; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
37362; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37363; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
37364; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v9, vcc
37365; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37366; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc
37367; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
37368; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
37369; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37370; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
37371; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
37372; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
37373; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
37374; GFX9-NEXT:    v_perm_b32 v2, v5, v4, s4
37375; GFX9-NEXT:    v_perm_b32 v3, v7, v6, s4
37376; GFX9-NEXT:    s_setpc_b64 s[30:31]
37377;
37378; GFX10-LABEL: v_vselect_v8bf16:
37379; GFX10:       ; %bb.0:
37380; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37381; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
37382; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
37383; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
37384; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
37385; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
37386; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
37387; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
37388; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
37389; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
37390; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
37391; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
37392; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
37393; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
37394; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
37395; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
37396; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
37397; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
37398; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
37399; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
37400; GFX10-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
37401; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
37402; GFX10-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
37403; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
37404; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
37405; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
37406; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37407; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
37408; GFX10-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
37409; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
37410; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
37411; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37412; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
37413; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
37414; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37415; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
37416; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
37417; GFX10-NEXT:    s_setpc_b64 s[30:31]
37418;
37419; GFX11TRUE16-LABEL: v_vselect_v8bf16:
37420; GFX11TRUE16:       ; %bb.0:
37421; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37422; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
37423; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
37424; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
37425; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
37426; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
37427; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
37428; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
37429; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v7
37430; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v6
37431; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
37432; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v5
37433; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
37434; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
37435; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v1
37436; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
37437; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
37438; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v2
37439; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v3
37440; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
37441; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v1.l, s2
37442; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
37443; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
37444; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
37445; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v10
37446; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
37447; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v15.l, v11.l, s3
37448; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v14.l, v10.l, s4
37449; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo
37450; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.l, v8.l, s0
37451; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v5.l, v4.l, s1
37452; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v13.l, v9.l, s5
37453; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v7.l, v6.l, s6
37454; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
37455; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
37456; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
37457; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
37458; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
37459; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
37460; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.h
37461; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
37462; GFX11TRUE16-NEXT:    v_perm_b32 v0, v4, v5, 0x5040100
37463; GFX11TRUE16-NEXT:    v_perm_b32 v1, v2, v6, 0x5040100
37464; GFX11TRUE16-NEXT:    v_perm_b32 v2, v3, v7, 0x5040100
37465; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
37466; GFX11TRUE16-NEXT:    v_perm_b32 v3, v8, v9, 0x5040100
37467; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
37468;
37469; GFX11FAKE16-LABEL: v_vselect_v8bf16:
37470; GFX11FAKE16:       ; %bb.0:
37471; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37472; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
37473; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
37474; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
37475; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
37476; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
37477; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
37478; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
37479; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
37480; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
37481; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
37482; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
37483; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
37484; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
37485; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
37486; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
37487; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
37488; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
37489; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
37490; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
37491; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
37492; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
37493; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
37494; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
37495; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
37496; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37497; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
37498; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
37499; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
37500; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37501; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
37502; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37503; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
37504; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
37505; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37506; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
37507; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
37508; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
37509; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
37510  %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
37511  ret <8 x bfloat> %op
37512}
37513
37514define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
37515; GCN-LABEL: v_vselect_v16bf16:
37516; GCN:       ; %bb.0:
37517; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37518; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
37519; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
37520; GCN-NEXT:    s_mov_b64 exec, s[4:5]
37521; GCN-NEXT:    s_waitcnt expcnt(0)
37522; GCN-NEXT:    v_writelane_b32 v31, s30, 0
37523; GCN-NEXT:    v_writelane_b32 v31, s31, 1
37524; GCN-NEXT:    v_writelane_b32 v31, s34, 2
37525; GCN-NEXT:    v_writelane_b32 v31, s35, 3
37526; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
37527; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37528; GCN-NEXT:    v_and_b32_e32 v0, 1, v1
37529; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
37530; GCN-NEXT:    v_and_b32_e32 v0, 1, v2
37531; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
37532; GCN-NEXT:    v_and_b32_e32 v0, 1, v3
37533; GCN-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
37534; GCN-NEXT:    v_and_b32_e32 v0, 1, v4
37535; GCN-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
37536; GCN-NEXT:    v_and_b32_e32 v0, 1, v5
37537; GCN-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
37538; GCN-NEXT:    v_and_b32_e32 v0, 1, v6
37539; GCN-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
37540; GCN-NEXT:    v_and_b32_e32 v0, 1, v7
37541; GCN-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
37542; GCN-NEXT:    v_and_b32_e32 v0, 1, v8
37543; GCN-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
37544; GCN-NEXT:    v_and_b32_e32 v0, 1, v9
37545; GCN-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
37546; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v16
37547; GCN-NEXT:    v_and_b32_e32 v1, 1, v10
37548; GCN-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v1
37549; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
37550; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v17
37551; GCN-NEXT:    v_and_b32_e32 v2, 1, v11
37552; GCN-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v2
37553; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8
37554; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v18
37555; GCN-NEXT:    v_and_b32_e32 v3, 1, v12
37556; GCN-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v3
37557; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:12
37558; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v19
37559; GCN-NEXT:    v_and_b32_e32 v7, 1, v13
37560; GCN-NEXT:    v_and_b32_e32 v8, 1, v14
37561; GCN-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v7
37562; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32
37563; GCN-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v8
37564; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:64
37565; GCN-NEXT:    v_and_b32_e32 v9, 1, v15
37566; GCN-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v9
37567; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:60
37568; GCN-NEXT:    s_waitcnt vmcnt(2)
37569; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37570; GCN-NEXT:    s_waitcnt vmcnt(1)
37571; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37572; GCN-NEXT:    v_cndmask_b32_e64 v15, v8, v7, s[34:35]
37573; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:56
37574; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v30
37575; GCN-NEXT:    s_waitcnt vmcnt(1)
37576; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
37577; GCN-NEXT:    v_cndmask_b32_e64 v14, v9, v8, s[30:31]
37578; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:52
37579; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v29
37580; GCN-NEXT:    s_waitcnt vmcnt(1)
37581; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37582; GCN-NEXT:    v_cndmask_b32_e64 v13, v7, v9, s[28:29]
37583; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
37584; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v28
37585; GCN-NEXT:    s_waitcnt vmcnt(1)
37586; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37587; GCN-NEXT:    v_cndmask_b32_e64 v12, v8, v9, s[26:27]
37588; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:44
37589; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v27
37590; GCN-NEXT:    s_waitcnt vmcnt(1)
37591; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37592; GCN-NEXT:    v_cndmask_b32_e64 v11, v7, v9, s[24:25]
37593; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40
37594; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v26
37595; GCN-NEXT:    s_waitcnt vmcnt(1)
37596; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37597; GCN-NEXT:    v_cndmask_b32_e64 v10, v8, v9, s[22:23]
37598; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:36
37599; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v25
37600; GCN-NEXT:    s_waitcnt vmcnt(1)
37601; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37602; GCN-NEXT:    v_cndmask_b32_e64 v9, v7, v9, s[20:21]
37603; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
37604; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v24
37605; GCN-NEXT:    s_waitcnt vmcnt(1)
37606; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37607; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v16, s[18:19]
37608; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:28
37609; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v23
37610; GCN-NEXT:    s_waitcnt vmcnt(1)
37611; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37612; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v17, s[16:17]
37613; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:24
37614; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v22
37615; GCN-NEXT:    s_waitcnt vmcnt(1)
37616; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
37617; GCN-NEXT:    v_cndmask_b32_e64 v16, v16, v18, s[14:15]
37618; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
37619; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v20
37620; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v21
37621; GCN-NEXT:    s_waitcnt vmcnt(1)
37622; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
37623; GCN-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s[12:13]
37624; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
37625; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
37626; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
37627; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
37628; GCN-NEXT:    s_waitcnt vmcnt(1)
37629; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
37630; GCN-NEXT:    s_waitcnt vmcnt(0)
37631; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
37632; GCN-NEXT:    v_cndmask_b32_e64 v19, v20, v19, s[10:11]
37633; GCN-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[8:9]
37634; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
37635; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
37636; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
37637; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
37638; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
37639; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
37640; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
37641; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v19
37642; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v17
37643; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v16
37644; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
37645; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
37646; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
37647; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
37648; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
37649; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
37650; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
37651; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
37652; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
37653; GCN-NEXT:    v_readlane_b32 s35, v31, 3
37654; GCN-NEXT:    v_readlane_b32 s34, v31, 2
37655; GCN-NEXT:    v_readlane_b32 s31, v31, 1
37656; GCN-NEXT:    v_readlane_b32 s30, v31, 0
37657; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
37658; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
37659; GCN-NEXT:    s_mov_b64 exec, s[4:5]
37660; GCN-NEXT:    s_waitcnt vmcnt(0)
37661; GCN-NEXT:    s_setpc_b64 s[30:31]
37662;
37663; GFX7-LABEL: v_vselect_v16bf16:
37664; GFX7:       ; %bb.0:
37665; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37666; GFX7-NEXT:    v_and_b32_e32 v8, 1, v8
37667; GFX7-NEXT:    v_and_b32_e32 v7, 1, v7
37668; GFX7-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v8
37669; GFX7-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v7
37670; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32
37671; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:64
37672; GFX7-NEXT:    v_and_b32_e32 v15, 1, v15
37673; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v15
37674; GFX7-NEXT:    v_and_b32_e32 v14, 1, v14
37675; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v14
37676; GFX7-NEXT:    v_and_b32_e32 v13, 1, v13
37677; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v13
37678; GFX7-NEXT:    v_and_b32_e32 v12, 1, v12
37679; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v12
37680; GFX7-NEXT:    v_and_b32_e32 v11, 1, v11
37681; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v11
37682; GFX7-NEXT:    v_and_b32_e32 v10, 1, v10
37683; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
37684; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
37685; GFX7-NEXT:    v_and_b32_e32 v5, 1, v5
37686; GFX7-NEXT:    v_and_b32_e32 v9, 1, v9
37687; GFX7-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v9
37688; GFX7-NEXT:    v_and_b32_e32 v4, 1, v4
37689; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
37690; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
37691; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
37692; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
37693; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
37694; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
37695; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
37696; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
37697; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
37698; GFX7-NEXT:    s_waitcnt vmcnt(1)
37699; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
37700; GFX7-NEXT:    s_waitcnt vmcnt(0)
37701; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37702; GFX7-NEXT:    v_cndmask_b32_e64 v15, v8, v7, s[12:13]
37703; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:60
37704; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v30
37705; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
37706; GFX7-NEXT:    s_waitcnt vmcnt(0)
37707; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37708; GFX7-NEXT:    v_cndmask_b32_e64 v14, v8, v7, s[10:11]
37709; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:56
37710; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v29
37711; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
37712; GFX7-NEXT:    s_waitcnt vmcnt(0)
37713; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37714; GFX7-NEXT:    v_cndmask_b32_e64 v13, v8, v7, s[8:9]
37715; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:52
37716; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v28
37717; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
37718; GFX7-NEXT:    s_waitcnt vmcnt(0)
37719; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37720; GFX7-NEXT:    v_cndmask_b32_e64 v12, v8, v7, s[6:7]
37721; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48
37722; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v27
37723; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
37724; GFX7-NEXT:    s_waitcnt vmcnt(0)
37725; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37726; GFX7-NEXT:    v_cndmask_b32_e64 v11, v8, v7, s[4:5]
37727; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:44
37728; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v26
37729; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
37730; GFX7-NEXT:    s_waitcnt vmcnt(0)
37731; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37732; GFX7-NEXT:    v_cndmask_b32_e32 v10, v8, v7, vcc
37733; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
37734; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v22
37735; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
37736; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40
37737; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v25
37738; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
37739; GFX7-NEXT:    s_waitcnt vmcnt(1)
37740; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
37741; GFX7-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
37742; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
37743; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v21
37744; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:24
37745; GFX7-NEXT:    s_waitcnt vmcnt(1)
37746; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37747; GFX7-NEXT:    v_cndmask_b32_e64 v9, v8, v7, s[18:19]
37748; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:36
37749; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v24
37750; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
37751; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
37752; GFX7-NEXT:    s_waitcnt vmcnt(1)
37753; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
37754; GFX7-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
37755; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
37756; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20
37757; GFX7-NEXT:    s_waitcnt vmcnt(1)
37758; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
37759; GFX7-NEXT:    v_cndmask_b32_e64 v8, v8, v7, s[16:17]
37760; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v23
37761; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
37762; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
37763; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
37764; GFX7-NEXT:    s_waitcnt vmcnt(1)
37765; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
37766; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
37767; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
37768; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37769; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
37770; GFX7-NEXT:    s_waitcnt vmcnt(2)
37771; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
37772; GFX7-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s[14:15]
37773; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
37774; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
37775; GFX7-NEXT:    s_waitcnt vmcnt(1)
37776; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
37777; GFX7-NEXT:    v_cndmask_b32_e32 v19, v20, v19, vcc
37778; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
37779; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37780; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
37781; GFX7-NEXT:    s_waitcnt vmcnt(2)
37782; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
37783; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v18, vcc
37784; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37785; GFX7-NEXT:    s_waitcnt vmcnt(1)
37786; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v20
37787; GFX7-NEXT:    s_waitcnt vmcnt(0)
37788; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
37789; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v17, vcc
37790; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37791; GFX7-NEXT:    v_cndmask_b32_e32 v0, v18, v16, vcc
37792; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
37793; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
37794; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
37795; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v19
37796; GFX7-NEXT:    s_setpc_b64 s[30:31]
37797;
37798; GFX8-LABEL: v_vselect_v16bf16:
37799; GFX8:       ; %bb.0:
37800; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37801; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
37802; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
37803; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
37804; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
37805; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37806; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
37807; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
37808; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
37809; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
37810; GFX8-NEXT:    v_and_b32_e32 v0, 1, v3
37811; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
37812; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
37813; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
37814; GFX8-NEXT:    v_and_b32_e32 v0, 1, v5
37815; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
37816; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
37817; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
37818; GFX8-NEXT:    v_and_b32_e32 v0, 1, v7
37819; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
37820; GFX8-NEXT:    v_and_b32_e32 v0, 1, v8
37821; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
37822; GFX8-NEXT:    v_and_b32_e32 v0, 1, v9
37823; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
37824; GFX8-NEXT:    v_and_b32_e32 v0, 1, v10
37825; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
37826; GFX8-NEXT:    v_and_b32_e32 v0, 1, v11
37827; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
37828; GFX8-NEXT:    v_and_b32_e32 v0, 1, v12
37829; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
37830; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
37831; GFX8-NEXT:    v_and_b32_e32 v0, 1, v13
37832; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
37833; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
37834; GFX8-NEXT:    v_and_b32_e32 v0, 1, v14
37835; GFX8-NEXT:    v_writelane_b32 v31, s34, 2
37836; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
37837; GFX8-NEXT:    v_and_b32_e32 v0, 1, v15
37838; GFX8-NEXT:    v_writelane_b32 v31, s35, 3
37839; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
37840; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
37841; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
37842; GFX8-NEXT:    v_cndmask_b32_e64 v6, v1, v0, s[28:29]
37843; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
37844; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
37845; GFX8-NEXT:    v_cndmask_b32_e64 v5, v1, v0, s[24:25]
37846; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
37847; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
37848; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, v0, s[20:21]
37849; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32
37850; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
37851; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v24
37852; GFX8-NEXT:    v_cndmask_b32_e64 v7, v30, v22, s[26:27]
37853; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
37854; GFX8-NEXT:    v_cndmask_b32_e64 v8, v29, v21, s[22:23]
37855; GFX8-NEXT:    v_cndmask_b32_e64 v9, v28, v20, s[18:19]
37856; GFX8-NEXT:    v_cndmask_b32_e64 v12, v27, v19, s[14:15]
37857; GFX8-NEXT:    v_cndmask_b32_e64 v13, v26, v18, s[10:11]
37858; GFX8-NEXT:    v_cndmask_b32_e64 v14, v25, v17, s[6:7]
37859; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
37860; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
37861; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37862; GFX8-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37863; GFX8-NEXT:    v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37864; GFX8-NEXT:    s_waitcnt vmcnt(0)
37865; GFX8-NEXT:    v_cndmask_b32_e64 v10, v0, v23, s[30:31]
37866; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
37867; GFX8-NEXT:    v_cndmask_b32_e64 v11, v0, v1, s[34:35]
37868; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
37869; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
37870; GFX8-NEXT:    v_cndmask_b32_e64 v3, v1, v0, s[16:17]
37871; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
37872; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v26
37873; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[12:13]
37874; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
37875; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
37876; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[8:9]
37877; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
37878; GFX8-NEXT:    v_cndmask_b32_e64 v0, v15, v0, s[4:5]
37879; GFX8-NEXT:    v_cndmask_b32_e32 v15, v24, v16, vcc
37880; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
37881; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
37882; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
37883; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
37884; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
37885; GFX8-NEXT:    v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37886; GFX8-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37887; GFX8-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37888; GFX8-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37889; GFX8-NEXT:    v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37890; GFX8-NEXT:    v_readlane_b32 s35, v31, 3
37891; GFX8-NEXT:    v_readlane_b32 s34, v31, 2
37892; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
37893; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
37894; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
37895; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
37896; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
37897; GFX8-NEXT:    s_waitcnt vmcnt(0)
37898; GFX8-NEXT:    s_setpc_b64 s[30:31]
37899;
37900; GFX9-LABEL: v_vselect_v16bf16:
37901; GFX9:       ; %bb.0:
37902; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37903; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
37904; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
37905; GFX9-NEXT:    v_and_b32_e32 v13, 1, v13
37906; GFX9-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc
37907; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
37908; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
37909; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
37910; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
37911; GFX9-NEXT:    v_cndmask_b32_e32 v13, v30, v22, vcc
37912; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
37913; GFX9-NEXT:    v_and_b32_e32 v10, 1, v11
37914; GFX9-NEXT:    v_cndmask_b32_e32 v11, v29, v21, vcc
37915; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
37916; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v29
37917; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
37918; GFX9-NEXT:    v_cndmask_b32_e32 v10, v22, v21, vcc
37919; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32
37920; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
37921; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
37922; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
37923; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
37924; GFX9-NEXT:    v_cndmask_b32_e32 v20, v28, v20, vcc
37925; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v28
37926; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
37927; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
37928; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
37929; GFX9-NEXT:    v_cndmask_b32_e32 v8, v22, v8, vcc
37930; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
37931; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
37932; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v27
37933; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
37934; GFX9-NEXT:    v_cndmask_b32_e32 v19, v27, v19, vcc
37935; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
37936; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
37937; GFX9-NEXT:    v_cndmask_b32_e32 v9, v22, v9, vcc
37938; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
37939; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v18
37940; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
37941; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
37942; GFX9-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc
37943; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
37944; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
37945; GFX9-NEXT:    v_cndmask_b32_e32 v5, v27, v6, vcc
37946; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
37947; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
37948; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
37949; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
37950; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
37951; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
37952; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
37953; GFX9-NEXT:    s_waitcnt vmcnt(0)
37954; GFX9-NEXT:    v_cndmask_b32_e32 v14, v21, v23, vcc
37955; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v21
37956; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
37957; GFX9-NEXT:    v_cndmask_b32_e32 v7, v6, v7, vcc
37958; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
37959; GFX9-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc
37960; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
37961; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v25
37962; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
37963; GFX9-NEXT:    v_cndmask_b32_e32 v3, v15, v6, vcc
37964; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
37965; GFX9-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
37966; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v16
37967; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v24
37968; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
37969; GFX9-NEXT:    v_cndmask_b32_e32 v1, v15, v6, vcc
37970; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
37971; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
37972; GFX9-NEXT:    v_perm_b32 v2, v5, v4, s4
37973; GFX9-NEXT:    v_perm_b32 v3, v9, v19, s4
37974; GFX9-NEXT:    v_perm_b32 v4, v8, v20, s4
37975; GFX9-NEXT:    v_perm_b32 v5, v10, v11, s4
37976; GFX9-NEXT:    v_perm_b32 v6, v13, v12, s4
37977; GFX9-NEXT:    v_perm_b32 v7, v7, v14, s4
37978; GFX9-NEXT:    s_setpc_b64 s[30:31]
37979;
37980; GFX10-LABEL: v_vselect_v16bf16:
37981; GFX10:       ; %bb.0:
37982; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37983; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
37984; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
37985; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
37986; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
37987; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
37988; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
37989; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
37990; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
37991; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
37992; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
37993; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
37994; GFX10-NEXT:    v_cndmask_b32_e32 v22, v30, v22, vcc_lo
37995; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
37996; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
37997; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
37998; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
37999; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
38000; GFX10-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc_lo
38001; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
38002; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
38003; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
38004; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
38005; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
38006; GFX10-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
38007; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
38008; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
38009; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v25
38010; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
38011; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
38012; GFX10-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
38013; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
38014; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
38015; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v24
38016; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
38017; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
38018; GFX10-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
38019; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
38020; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
38021; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
38022; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
38023; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
38024; GFX10-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
38025; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
38026; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
38027; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
38028; GFX10-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
38029; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
38030; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
38031; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
38032; GFX10-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
38033; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
38034; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v51, vcc_lo
38035; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
38036; GFX10-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
38037; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
38038; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v30, vcc_lo
38039; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
38040; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
38041; GFX10-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
38042; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
38043; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
38044; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
38045; GFX10-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
38046; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
38047; GFX10-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
38048; GFX10-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
38049; GFX10-NEXT:    s_waitcnt vmcnt(0)
38050; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
38051; GFX10-NEXT:    v_cndmask_b32_e32 v12, v31, v23, vcc_lo
38052; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
38053; GFX10-NEXT:    v_cndmask_b32_e32 v13, v3, v32, vcc_lo
38054; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
38055; GFX10-NEXT:    v_perm_b32 v6, v33, v22, 0x5040100
38056; GFX10-NEXT:    v_perm_b32 v7, v13, v12, 0x5040100
38057; GFX10-NEXT:    s_setpc_b64 s[30:31]
38058;
38059; GFX11TRUE16-LABEL: v_vselect_v16bf16:
38060; GFX11TRUE16:       ; %bb.0:
38061; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38062; GFX11TRUE16-NEXT:    scratch_load_b32 v31, off, s32
38063; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
38064; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
38065; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
38066; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
38067; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
38068; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
38069; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
38070; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
38071; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
38072; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
38073; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
38074; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
38075; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
38076; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
38077; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
38078; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
38079; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
38080; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
38081; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
38082; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
38083; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
38084; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
38085; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
38086; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
38087; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
38088; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
38089; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
38090; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
38091; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
38092; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v28.l, v20.l, s8
38093; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v38.l, v37.l, s7
38094; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
38095; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
38096; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
38097; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
38098; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
38099; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
38100; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
38101; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
38102; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
38103; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
38104; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
38105; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
38106; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
38107; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v12
38108; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
38109; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v10
38110; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
38111; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
38112; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v27.l, v19.l, s6
38113; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v48.l, v39.l, s5
38114; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo
38115; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v24.l, v16.l, s0
38116; GFX11TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.h
38117; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
38118; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v30.l, v22.l, s10
38119; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v34.l, v33.l, s11
38120; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v29.l, v21.l, s12
38121; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v36.l, v35.l, s9
38122; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v52.l, v51.l, s1
38123; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v25.l, v17.l, s2
38124; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v50.l, v49.l, s3
38125; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
38126; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
38127; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.h
38128; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
38129; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v26.l, v18.l, s4
38130; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
38131; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
38132; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
38133; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v1.h
38134; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.l, v1.l
38135; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v0.h
38136; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
38137; GFX11TRUE16-NEXT:    v_perm_b32 v0, v7, v8, 0x5040100
38138; GFX11TRUE16-NEXT:    v_perm_b32 v1, v5, v9, 0x5040100
38139; GFX11TRUE16-NEXT:    v_perm_b32 v5, v14, v15, 0x5040100
38140; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
38141; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v31
38142; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v31.l, v23.l, s14
38143; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
38144; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v2.l, v32.l, s13
38145; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v3.l
38146; GFX11TRUE16-NEXT:    v_perm_b32 v2, v6, v4, 0x5040100
38147; GFX11TRUE16-NEXT:    v_perm_b32 v4, v12, v13, 0x5040100
38148; GFX11TRUE16-NEXT:    v_perm_b32 v6, v16, v17, 0x5040100
38149; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
38150; GFX11TRUE16-NEXT:    v_perm_b32 v3, v10, v11, 0x5040100
38151; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
38152; GFX11TRUE16-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
38153; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
38154;
38155; GFX11FAKE16-LABEL: v_vselect_v16bf16:
38156; GFX11FAKE16:       ; %bb.0:
38157; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38158; GFX11FAKE16-NEXT:    scratch_load_b32 v31, off, s32
38159; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
38160; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
38161; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
38162; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
38163; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
38164; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
38165; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
38166; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
38167; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
38168; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
38169; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
38170; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
38171; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
38172; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
38173; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
38174; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
38175; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
38176; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
38177; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
38178; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
38179; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
38180; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
38181; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
38182; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
38183; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
38184; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
38185; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
38186; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
38187; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
38188; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
38189; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
38190; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
38191; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
38192; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
38193; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
38194; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
38195; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
38196; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
38197; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
38198; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
38199; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
38200; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
38201; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
38202; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
38203; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
38204; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
38205; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
38206; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v52, v51, vcc_lo
38207; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
38208; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
38209; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
38210; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v54, v53, vcc_lo
38211; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
38212; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
38213; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
38214; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
38215; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
38216; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
38217; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
38218; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
38219; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
38220; GFX11FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
38221; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
38222; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
38223; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
38224; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
38225; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
38226; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
38227; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
38228; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v15, v3, v32, vcc_lo
38229; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
38230; GFX11FAKE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
38231; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
38232; GFX11FAKE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
38233; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
38234  %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
38235  ret <16 x bfloat> %op
38236}
38237
38238define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
38239; GCN-LABEL: v_vselect_v32bf16:
38240; GCN:       ; %bb.0:
38241; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38242; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
38243; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
38244; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
38245; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
38246; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
38247; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
38248; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
38249; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
38250; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
38251; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
38252; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
38253; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
38254; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
38255; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
38256; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
38257; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
38258; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
38259; GCN-NEXT:    v_and_b32_e32 v36, 1, v13
38260; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52
38261; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
38262; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
38263; GCN-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:184
38264; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
38265; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:188
38266; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
38267; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192
38268; GCN-NEXT:    v_and_b32_e32 v53, 1, v26
38269; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:84
38270; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:88
38271; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92
38272; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96
38273; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
38274; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
38275; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:108
38276; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:112
38277; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
38278; GCN-NEXT:    v_and_b32_e32 v28, 1, v28
38279; GCN-NEXT:    v_and_b32_e32 v29, 1, v29
38280; GCN-NEXT:    v_and_b32_e32 v30, 1, v30
38281; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:116
38282; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120
38283; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124
38284; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32
38285; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:252
38286; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:248
38287; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:244
38288; GCN-NEXT:    s_waitcnt expcnt(6)
38289; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:240
38290; GCN-NEXT:    s_waitcnt vmcnt(14)
38291; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v37
38292; GCN-NEXT:    v_mul_f32_e32 v38, 1.0, v38
38293; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v36
38294; GCN-NEXT:    s_waitcnt vmcnt(5)
38295; GCN-NEXT:    v_mul_f32_e32 v36, 1.0, v43
38296; GCN-NEXT:    s_waitcnt vmcnt(3)
38297; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v44
38298; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v30
38299; GCN-NEXT:    v_cndmask_b32_e64 v30, v37, v36, s[4:5]
38300; GCN-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:236
38301; GCN-NEXT:    s_waitcnt expcnt(5)
38302; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:232
38303; GCN-NEXT:    s_waitcnt expcnt(4)
38304; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:228
38305; GCN-NEXT:    s_waitcnt expcnt(3)
38306; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:224
38307; GCN-NEXT:    s_waitcnt expcnt(2)
38308; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:220
38309; GCN-NEXT:    s_waitcnt expcnt(1)
38310; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:216
38311; GCN-NEXT:    s_waitcnt expcnt(0)
38312; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:212
38313; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:128
38314; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v42
38315; GCN-NEXT:    s_waitcnt vmcnt(10)
38316; GCN-NEXT:    v_mul_f32_e32 v43, 1.0, v45
38317; GCN-NEXT:    v_mul_f32_e32 v41, 1.0, v41
38318; GCN-NEXT:    s_waitcnt vmcnt(9)
38319; GCN-NEXT:    v_mul_f32_e32 v44, 1.0, v46
38320; GCN-NEXT:    v_mul_f32_e32 v55, 1.0, v55
38321; GCN-NEXT:    s_waitcnt vmcnt(8)
38322; GCN-NEXT:    v_mul_f32_e32 v45, 1.0, v47
38323; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
38324; GCN-NEXT:    s_waitcnt vmcnt(7)
38325; GCN-NEXT:    v_mul_f32_e32 v36, 1.0, v36
38326; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v29
38327; GCN-NEXT:    v_cndmask_b32_e64 v29, v43, v42, s[4:5]
38328; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v28
38329; GCN-NEXT:    v_cndmask_b32_e64 v28, v44, v41, s[4:5]
38330; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v27
38331; GCN-NEXT:    v_cndmask_b32_e64 v27, v45, v55, s[4:5]
38332; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v53
38333; GCN-NEXT:    v_cndmask_b32_e64 v36, v36, v54, s[4:5]
38334; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:4
38335; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:132
38336; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:8
38337; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:136
38338; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:12
38339; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140
38340; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16
38341; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:144
38342; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
38343; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
38344; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
38345; GCN-NEXT:    v_and_b32_e32 v6, 1, v6
38346; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
38347; GCN-NEXT:    v_and_b32_e32 v22, 1, v22
38348; GCN-NEXT:    v_and_b32_e32 v23, 1, v23
38349; GCN-NEXT:    v_and_b32_e32 v24, 1, v24
38350; GCN-NEXT:    v_and_b32_e32 v25, 1, v25
38351; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
38352; GCN-NEXT:    s_waitcnt vmcnt(14)
38353; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v56
38354; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
38355; GCN-NEXT:    s_waitcnt vmcnt(13)
38356; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v57
38357; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
38358; GCN-NEXT:    s_waitcnt vmcnt(12)
38359; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v58
38360; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
38361; GCN-NEXT:    s_waitcnt vmcnt(11)
38362; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v59
38363; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v25
38364; GCN-NEXT:    v_cndmask_b32_e64 v25, v46, v52, s[4:5]
38365; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v24
38366; GCN-NEXT:    v_cndmask_b32_e64 v24, v47, v51, s[4:5]
38367; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v23
38368; GCN-NEXT:    v_cndmask_b32_e64 v23, v56, v50, s[4:5]
38369; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v22
38370; GCN-NEXT:    v_cndmask_b32_e64 v22, v57, v49, s[4:5]
38371; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:68
38372; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:196
38373; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
38374; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:200
38375; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:76
38376; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:204
38377; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:80
38378; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:208
38379; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
38380; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
38381; GCN-NEXT:    v_and_b32_e32 v21, 1, v21
38382; GCN-NEXT:    v_mul_f32_e32 v48, 1.0, v48
38383; GCN-NEXT:    s_waitcnt vmcnt(14)
38384; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v60
38385; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
38386; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v61
38387; GCN-NEXT:    s_waitcnt vmcnt(3)
38388; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v46
38389; GCN-NEXT:    s_waitcnt vmcnt(2)
38390; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v47
38391; GCN-NEXT:    s_waitcnt vmcnt(1)
38392; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v56
38393; GCN-NEXT:    s_waitcnt vmcnt(0)
38394; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v57
38395; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v21
38396; GCN-NEXT:    v_cndmask_b32_e64 v21, v58, v48, s[4:5]
38397; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v20
38398; GCN-NEXT:    v_cndmask_b32_e64 v20, v59, v39, s[4:5]
38399; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v19
38400; GCN-NEXT:    v_cndmask_b32_e64 v19, v57, v56, s[4:5]
38401; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v18
38402; GCN-NEXT:    v_cndmask_b32_e64 v18, v47, v46, s[4:5]
38403; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20
38404; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:148
38405; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24
38406; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:152
38407; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28
38408; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:156
38409; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:32
38410; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:160
38411; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
38412; GCN-NEXT:    v_and_b32_e32 v8, 1, v8
38413; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
38414; GCN-NEXT:    v_and_b32_e32 v10, 1, v10
38415; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
38416; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
38417; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
38418; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
38419; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38420; GCN-NEXT:    v_mul_f32_e32 v33, 1.0, v33
38421; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
38422; GCN-NEXT:    v_mul_f32_e32 v35, 1.0, v35
38423; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
38424; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
38425; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
38426; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
38427; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v17
38428; GCN-NEXT:    v_cndmask_b32_e64 v17, v52, v51, s[4:5]
38429; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
38430; GCN-NEXT:    v_cndmask_b32_e64 v16, v50, v49, s[4:5]
38431; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v15
38432; GCN-NEXT:    v_cndmask_b32_e64 v15, v35, v34, s[4:5]
38433; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v14
38434; GCN-NEXT:    v_cndmask_b32_e64 v14, v33, v32, s[4:5]
38435; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
38436; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:164
38437; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:40
38438; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:168
38439; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:44
38440; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:172
38441; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:48
38442; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:176
38443; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
38444; GCN-NEXT:    v_and_b32_e32 v12, 1, v12
38445; GCN-NEXT:    v_cndmask_b32_e32 v38, v38, v40, vcc
38446; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:256
38447; GCN-NEXT:    v_and_b32_e32 v26, 1, v26
38448; GCN-NEXT:    v_mul_f32_e32 v53, 1.0, v53
38449; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
38450; GCN-NEXT:    v_mul_f32_e32 v55, 1.0, v55
38451; GCN-NEXT:    v_mul_f32_e32 v41, 1.0, v41
38452; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v42
38453; GCN-NEXT:    v_mul_f32_e32 v43, 1.0, v43
38454; GCN-NEXT:    v_mul_f32_e32 v44, 1.0, v44
38455; GCN-NEXT:    v_mul_f32_e32 v45, 1.0, v45
38456; GCN-NEXT:    s_waitcnt vmcnt(14)
38457; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
38458; GCN-NEXT:    v_mul_f32_e32 v48, 1.0, v48
38459; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v46
38460; GCN-NEXT:    s_waitcnt vmcnt(13)
38461; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v47
38462; GCN-NEXT:    s_waitcnt vmcnt(12)
38463; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v56
38464; GCN-NEXT:    s_waitcnt vmcnt(11)
38465; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v57
38466; GCN-NEXT:    s_waitcnt vmcnt(10)
38467; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v58
38468; GCN-NEXT:    s_waitcnt vmcnt(9)
38469; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v59
38470; GCN-NEXT:    s_waitcnt vmcnt(8)
38471; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38472; GCN-NEXT:    s_waitcnt vmcnt(7)
38473; GCN-NEXT:    v_mul_f32_e32 v33, 1.0, v33
38474; GCN-NEXT:    s_waitcnt vmcnt(6)
38475; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
38476; GCN-NEXT:    s_waitcnt vmcnt(5)
38477; GCN-NEXT:    v_mul_f32_e32 v35, 1.0, v35
38478; GCN-NEXT:    s_waitcnt vmcnt(4)
38479; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
38480; GCN-NEXT:    s_waitcnt vmcnt(3)
38481; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
38482; GCN-NEXT:    s_waitcnt vmcnt(2)
38483; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
38484; GCN-NEXT:    s_waitcnt vmcnt(1)
38485; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
38486; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
38487; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
38488; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v37
38489; GCN-NEXT:    s_waitcnt vmcnt(0)
38490; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v40
38491; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
38492; GCN-NEXT:    v_cndmask_b32_e32 v12, v31, v13, vcc
38493; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
38494; GCN-NEXT:    v_cndmask_b32_e32 v11, v52, v51, vcc
38495; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
38496; GCN-NEXT:    v_cndmask_b32_e32 v10, v50, v49, vcc
38497; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
38498; GCN-NEXT:    v_cndmask_b32_e32 v9, v35, v34, vcc
38499; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
38500; GCN-NEXT:    v_cndmask_b32_e32 v8, v33, v32, vcc
38501; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
38502; GCN-NEXT:    v_cndmask_b32_e32 v7, v59, v58, vcc
38503; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
38504; GCN-NEXT:    v_cndmask_b32_e32 v6, v57, v56, vcc
38505; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
38506; GCN-NEXT:    v_cndmask_b32_e32 v5, v47, v46, vcc
38507; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
38508; GCN-NEXT:    v_cndmask_b32_e32 v4, v48, v39, vcc
38509; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
38510; GCN-NEXT:    v_cndmask_b32_e32 v3, v45, v44, vcc
38511; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
38512; GCN-NEXT:    v_cndmask_b32_e32 v2, v43, v42, vcc
38513; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
38514; GCN-NEXT:    v_cndmask_b32_e32 v1, v41, v55, vcc
38515; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
38516; GCN-NEXT:    v_cndmask_b32_e32 v0, v54, v53, vcc
38517; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v26
38518; GCN-NEXT:    v_cndmask_b32_e32 v31, v40, v37, vcc
38519; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
38520; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
38521; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
38522; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
38523; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
38524; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
38525; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
38526; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
38527; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
38528; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
38529; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
38530; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
38531; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
38532; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
38533; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
38534; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
38535; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
38536; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
38537; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
38538; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
38539; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
38540; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
38541; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
38542; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
38543; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
38544; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
38545; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v36
38546; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
38547; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
38548; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
38549; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
38550; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
38551; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
38552; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
38553; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
38554; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
38555; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
38556; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
38557; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
38558; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
38559; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
38560; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
38561; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
38562; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
38563; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
38564; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
38565; GCN-NEXT:    s_waitcnt vmcnt(0)
38566; GCN-NEXT:    s_setpc_b64 s[30:31]
38567;
38568; GFX7-LABEL: v_vselect_v32bf16:
38569; GFX7:       ; %bb.0:
38570; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38571; GFX7-NEXT:    v_and_b32_e32 v24, 1, v24
38572; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v24
38573; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32
38574; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:228
38575; GFX7-NEXT:    v_and_b32_e32 v25, 1, v25
38576; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v25
38577; GFX7-NEXT:    v_and_b32_e32 v30, 1, v30
38578; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v30
38579; GFX7-NEXT:    v_and_b32_e32 v29, 1, v29
38580; GFX7-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v29
38581; GFX7-NEXT:    v_and_b32_e32 v28, 1, v28
38582; GFX7-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v28
38583; GFX7-NEXT:    v_and_b32_e32 v27, 1, v27
38584; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v27
38585; GFX7-NEXT:    v_and_b32_e32 v26, 1, v26
38586; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v26
38587; GFX7-NEXT:    v_and_b32_e32 v23, 1, v23
38588; GFX7-NEXT:    v_and_b32_e32 v22, 1, v22
38589; GFX7-NEXT:    v_and_b32_e32 v21, 1, v21
38590; GFX7-NEXT:    v_and_b32_e32 v20, 1, v20
38591; GFX7-NEXT:    v_and_b32_e32 v19, 1, v19
38592; GFX7-NEXT:    v_and_b32_e32 v18, 1, v18
38593; GFX7-NEXT:    v_and_b32_e32 v17, 1, v17
38594; GFX7-NEXT:    v_and_b32_e32 v16, 1, v16
38595; GFX7-NEXT:    v_and_b32_e32 v15, 1, v15
38596; GFX7-NEXT:    v_and_b32_e32 v14, 1, v14
38597; GFX7-NEXT:    v_and_b32_e32 v13, 1, v13
38598; GFX7-NEXT:    v_and_b32_e32 v12, 1, v12
38599; GFX7-NEXT:    v_and_b32_e32 v11, 1, v11
38600; GFX7-NEXT:    v_and_b32_e32 v10, 1, v10
38601; GFX7-NEXT:    v_and_b32_e32 v9, 1, v9
38602; GFX7-NEXT:    v_and_b32_e32 v8, 1, v8
38603; GFX7-NEXT:    v_and_b32_e32 v7, 1, v7
38604; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
38605; GFX7-NEXT:    v_and_b32_e32 v5, 1, v5
38606; GFX7-NEXT:    v_and_b32_e32 v4, 1, v4
38607; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
38608; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
38609; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
38610; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
38611; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:252
38612; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:256
38613; GFX7-NEXT:    s_waitcnt vmcnt(3)
38614; GFX7-NEXT:    v_and_b32_e32 v24, 1, v24
38615; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v24
38616; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:124
38617; GFX7-NEXT:    s_waitcnt vmcnt(3)
38618; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38619; GFX7-NEXT:    s_waitcnt vmcnt(2)
38620; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
38621; GFX7-NEXT:    s_waitcnt vmcnt(1)
38622; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
38623; GFX7-NEXT:    s_waitcnt vmcnt(0)
38624; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38625; GFX7-NEXT:    v_cndmask_b32_e64 v30, v25, v24, s[12:13]
38626; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:120
38627; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:248
38628; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
38629; GFX7-NEXT:    s_waitcnt vmcnt(1)
38630; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38631; GFX7-NEXT:    s_waitcnt vmcnt(0)
38632; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
38633; GFX7-NEXT:    v_cndmask_b32_e64 v29, v25, v24, s[14:15]
38634; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
38635; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:244
38636; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
38637; GFX7-NEXT:    s_waitcnt vmcnt(1)
38638; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38639; GFX7-NEXT:    s_waitcnt vmcnt(0)
38640; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
38641; GFX7-NEXT:    v_cndmask_b32_e64 v28, v25, v24, s[16:17]
38642; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:112
38643; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:240
38644; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
38645; GFX7-NEXT:    s_waitcnt vmcnt(1)
38646; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38647; GFX7-NEXT:    s_waitcnt vmcnt(0)
38648; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
38649; GFX7-NEXT:    v_cndmask_b32_e64 v27, v25, v24, s[10:11]
38650; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:108
38651; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:236
38652; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
38653; GFX7-NEXT:    s_waitcnt vmcnt(1)
38654; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38655; GFX7-NEXT:    s_waitcnt vmcnt(0)
38656; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
38657; GFX7-NEXT:    v_cndmask_b32_e64 v26, v25, v24, s[8:9]
38658; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:104
38659; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:232
38660; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
38661; GFX7-NEXT:    s_waitcnt vmcnt(1)
38662; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38663; GFX7-NEXT:    s_waitcnt vmcnt(0)
38664; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
38665; GFX7-NEXT:    v_cndmask_b32_e64 v25, v25, v24, s[6:7]
38666; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:128
38667; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
38668; GFX7-NEXT:    s_waitcnt vmcnt(0)
38669; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38670; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v24, s[4:5]
38671; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100
38672; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
38673; GFX7-NEXT:    s_waitcnt vmcnt(0)
38674; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
38675; GFX7-NEXT:    v_cndmask_b32_e32 v24, v32, v24, vcc
38676; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v23
38677; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
38678; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:224
38679; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
38680; GFX7-NEXT:    s_waitcnt vmcnt(1)
38681; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
38682; GFX7-NEXT:    s_waitcnt vmcnt(0)
38683; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38684; GFX7-NEXT:    v_cndmask_b32_e32 v23, v32, v23, vcc
38685; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v22
38686; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
38687; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:220
38688; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
38689; GFX7-NEXT:    s_waitcnt vmcnt(1)
38690; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
38691; GFX7-NEXT:    s_waitcnt vmcnt(0)
38692; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38693; GFX7-NEXT:    v_cndmask_b32_e32 v22, v32, v22, vcc
38694; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v21
38695; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
38696; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:216
38697; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
38698; GFX7-NEXT:    s_waitcnt vmcnt(1)
38699; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
38700; GFX7-NEXT:    s_waitcnt vmcnt(0)
38701; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38702; GFX7-NEXT:    v_cndmask_b32_e32 v21, v32, v21, vcc
38703; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v20
38704; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
38705; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:212
38706; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
38707; GFX7-NEXT:    s_waitcnt vmcnt(1)
38708; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
38709; GFX7-NEXT:    s_waitcnt vmcnt(0)
38710; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38711; GFX7-NEXT:    v_cndmask_b32_e32 v20, v32, v20, vcc
38712; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
38713; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
38714; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:208
38715; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
38716; GFX7-NEXT:    s_waitcnt vmcnt(1)
38717; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
38718; GFX7-NEXT:    s_waitcnt vmcnt(0)
38719; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38720; GFX7-NEXT:    v_cndmask_b32_e32 v19, v32, v19, vcc
38721; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
38722; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
38723; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:204
38724; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
38725; GFX7-NEXT:    s_waitcnt vmcnt(1)
38726; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
38727; GFX7-NEXT:    s_waitcnt vmcnt(0)
38728; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38729; GFX7-NEXT:    v_cndmask_b32_e32 v18, v32, v18, vcc
38730; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
38731; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
38732; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:200
38733; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
38734; GFX7-NEXT:    s_waitcnt vmcnt(1)
38735; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
38736; GFX7-NEXT:    s_waitcnt vmcnt(0)
38737; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38738; GFX7-NEXT:    v_cndmask_b32_e32 v17, v32, v17, vcc
38739; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
38740; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:68
38741; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:196
38742; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
38743; GFX7-NEXT:    s_waitcnt vmcnt(1)
38744; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
38745; GFX7-NEXT:    s_waitcnt vmcnt(0)
38746; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38747; GFX7-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
38748; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
38749; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64
38750; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:192
38751; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
38752; GFX7-NEXT:    s_waitcnt vmcnt(1)
38753; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
38754; GFX7-NEXT:    s_waitcnt vmcnt(0)
38755; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38756; GFX7-NEXT:    v_cndmask_b32_e32 v15, v32, v15, vcc
38757; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
38758; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
38759; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:188
38760; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
38761; GFX7-NEXT:    s_waitcnt vmcnt(1)
38762; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
38763; GFX7-NEXT:    s_waitcnt vmcnt(0)
38764; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38765; GFX7-NEXT:    v_cndmask_b32_e32 v14, v32, v14, vcc
38766; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
38767; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56
38768; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:184
38769; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
38770; GFX7-NEXT:    s_waitcnt vmcnt(1)
38771; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
38772; GFX7-NEXT:    s_waitcnt vmcnt(0)
38773; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38774; GFX7-NEXT:    v_cndmask_b32_e32 v13, v32, v13, vcc
38775; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
38776; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52
38777; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:180
38778; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
38779; GFX7-NEXT:    s_waitcnt vmcnt(1)
38780; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
38781; GFX7-NEXT:    s_waitcnt vmcnt(0)
38782; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38783; GFX7-NEXT:    v_cndmask_b32_e32 v12, v32, v12, vcc
38784; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
38785; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48
38786; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:176
38787; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
38788; GFX7-NEXT:    s_waitcnt vmcnt(1)
38789; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
38790; GFX7-NEXT:    s_waitcnt vmcnt(0)
38791; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38792; GFX7-NEXT:    v_cndmask_b32_e32 v11, v32, v11, vcc
38793; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
38794; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
38795; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:172
38796; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
38797; GFX7-NEXT:    s_waitcnt vmcnt(1)
38798; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
38799; GFX7-NEXT:    s_waitcnt vmcnt(0)
38800; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38801; GFX7-NEXT:    v_cndmask_b32_e32 v10, v32, v10, vcc
38802; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
38803; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:40
38804; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:168
38805; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
38806; GFX7-NEXT:    s_waitcnt vmcnt(1)
38807; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
38808; GFX7-NEXT:    s_waitcnt vmcnt(0)
38809; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38810; GFX7-NEXT:    v_cndmask_b32_e32 v9, v32, v9, vcc
38811; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
38812; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:36
38813; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:164
38814; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
38815; GFX7-NEXT:    s_waitcnt vmcnt(1)
38816; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
38817; GFX7-NEXT:    s_waitcnt vmcnt(0)
38818; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38819; GFX7-NEXT:    v_cndmask_b32_e32 v8, v32, v8, vcc
38820; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
38821; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
38822; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:160
38823; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
38824; GFX7-NEXT:    s_waitcnt vmcnt(1)
38825; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
38826; GFX7-NEXT:    s_waitcnt vmcnt(0)
38827; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38828; GFX7-NEXT:    v_cndmask_b32_e32 v7, v32, v7, vcc
38829; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
38830; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
38831; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:156
38832; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
38833; GFX7-NEXT:    s_waitcnt vmcnt(1)
38834; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
38835; GFX7-NEXT:    s_waitcnt vmcnt(0)
38836; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38837; GFX7-NEXT:    v_cndmask_b32_e32 v6, v32, v6, vcc
38838; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
38839; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24
38840; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:152
38841; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
38842; GFX7-NEXT:    s_waitcnt vmcnt(1)
38843; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
38844; GFX7-NEXT:    s_waitcnt vmcnt(0)
38845; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38846; GFX7-NEXT:    v_cndmask_b32_e32 v5, v32, v5, vcc
38847; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
38848; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20
38849; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148
38850; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
38851; GFX7-NEXT:    s_waitcnt vmcnt(1)
38852; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
38853; GFX7-NEXT:    s_waitcnt vmcnt(0)
38854; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38855; GFX7-NEXT:    v_cndmask_b32_e32 v4, v32, v4, vcc
38856; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
38857; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16
38858; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
38859; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
38860; GFX7-NEXT:    s_waitcnt vmcnt(1)
38861; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
38862; GFX7-NEXT:    s_waitcnt vmcnt(0)
38863; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38864; GFX7-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
38865; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
38866; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
38867; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:140
38868; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
38869; GFX7-NEXT:    s_waitcnt vmcnt(1)
38870; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
38871; GFX7-NEXT:    s_waitcnt vmcnt(0)
38872; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38873; GFX7-NEXT:    v_cndmask_b32_e32 v2, v32, v2, vcc
38874; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
38875; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
38876; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:136
38877; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
38878; GFX7-NEXT:    s_waitcnt vmcnt(1)
38879; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
38880; GFX7-NEXT:    s_waitcnt vmcnt(0)
38881; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38882; GFX7-NEXT:    v_cndmask_b32_e32 v1, v32, v1, vcc
38883; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
38884; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
38885; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
38886; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
38887; GFX7-NEXT:    s_waitcnt vmcnt(1)
38888; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
38889; GFX7-NEXT:    s_waitcnt vmcnt(0)
38890; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
38891; GFX7-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
38892; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
38893; GFX7-NEXT:    s_setpc_b64 s[30:31]
38894;
38895; GFX8-LABEL: v_vselect_v32bf16:
38896; GFX8:       ; %bb.0:
38897; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38898; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
38899; GFX8-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
38900; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
38901; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
38902; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
38903; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
38904; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
38905; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
38906; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
38907; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
38908; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
38909; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
38910; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
38911; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
38912; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
38913; GFX8-NEXT:    v_and_b32_e32 v0, 1, v3
38914; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
38915; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
38916; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
38917; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
38918; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
38919; GFX8-NEXT:    v_and_b32_e32 v0, 1, v5
38920; GFX8-NEXT:    v_writelane_b32 v34, s40, 8
38921; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
38922; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
38923; GFX8-NEXT:    v_writelane_b32 v34, s41, 9
38924; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
38925; GFX8-NEXT:    v_and_b32_e32 v0, 1, v7
38926; GFX8-NEXT:    v_writelane_b32 v34, s42, 10
38927; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
38928; GFX8-NEXT:    v_and_b32_e32 v0, 1, v8
38929; GFX8-NEXT:    v_writelane_b32 v34, s43, 11
38930; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
38931; GFX8-NEXT:    v_and_b32_e32 v0, 1, v9
38932; GFX8-NEXT:    v_writelane_b32 v34, s44, 12
38933; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
38934; GFX8-NEXT:    v_and_b32_e32 v0, 1, v10
38935; GFX8-NEXT:    v_writelane_b32 v34, s45, 13
38936; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
38937; GFX8-NEXT:    v_and_b32_e32 v0, 1, v11
38938; GFX8-NEXT:    v_writelane_b32 v34, s46, 14
38939; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
38940; GFX8-NEXT:    v_and_b32_e32 v0, 1, v12
38941; GFX8-NEXT:    v_writelane_b32 v34, s47, 15
38942; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
38943; GFX8-NEXT:    v_and_b32_e32 v0, 1, v13
38944; GFX8-NEXT:    v_writelane_b32 v34, s48, 16
38945; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
38946; GFX8-NEXT:    v_and_b32_e32 v0, 1, v14
38947; GFX8-NEXT:    v_writelane_b32 v34, s49, 17
38948; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
38949; GFX8-NEXT:    v_and_b32_e32 v0, 1, v15
38950; GFX8-NEXT:    v_writelane_b32 v34, s50, 18
38951; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
38952; GFX8-NEXT:    v_and_b32_e32 v0, 1, v16
38953; GFX8-NEXT:    v_writelane_b32 v34, s51, 19
38954; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
38955; GFX8-NEXT:    v_and_b32_e32 v0, 1, v17
38956; GFX8-NEXT:    v_writelane_b32 v34, s52, 20
38957; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
38958; GFX8-NEXT:    v_and_b32_e32 v0, 1, v18
38959; GFX8-NEXT:    v_writelane_b32 v34, s53, 21
38960; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
38961; GFX8-NEXT:    v_and_b32_e32 v0, 1, v19
38962; GFX8-NEXT:    v_writelane_b32 v34, s54, 22
38963; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
38964; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
38965; GFX8-NEXT:    v_writelane_b32 v34, s55, 23
38966; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
38967; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
38968; GFX8-NEXT:    v_writelane_b32 v34, s56, 24
38969; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
38970; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
38971; GFX8-NEXT:    v_writelane_b32 v34, s57, 25
38972; GFX8-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
38973; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
38974; GFX8-NEXT:    v_writelane_b32 v34, s58, 26
38975; GFX8-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
38976; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
38977; GFX8-NEXT:    v_writelane_b32 v34, s59, 27
38978; GFX8-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
38979; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
38980; GFX8-NEXT:    v_writelane_b32 v34, s60, 28
38981; GFX8-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
38982; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
38983; GFX8-NEXT:    v_writelane_b32 v34, s61, 29
38984; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
38985; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
38986; GFX8-NEXT:    v_writelane_b32 v34, s62, 30
38987; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
38988; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
38989; GFX8-NEXT:    v_writelane_b32 v34, s63, 31
38990; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
38991; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
38992; GFX8-NEXT:    v_writelane_b32 v34, s64, 32
38993; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
38994; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
38995; GFX8-NEXT:    v_writelane_b32 v34, s65, 33
38996; GFX8-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
38997; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
38998; GFX8-NEXT:    v_writelane_b32 v34, s66, 34
38999; GFX8-NEXT:    v_writelane_b32 v34, s67, 35
39000; GFX8-NEXT:    s_waitcnt vmcnt(0)
39001; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
39002; GFX8-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
39003; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
39004; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
39005; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
39006; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
39007; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
39008; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
39009; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
39010; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
39011; GFX8-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
39012; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
39013; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
39014; GFX8-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
39015; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
39016; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
39017; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
39018; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
39019; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
39020; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
39021; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
39022; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
39023; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
39024; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
39025; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
39026; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
39027; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
39028; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
39029; GFX8-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
39030; GFX8-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
39031; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
39032; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
39033; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:128
39034; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
39035; GFX8-NEXT:    s_waitcnt vmcnt(1)
39036; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
39037; GFX8-NEXT:    s_waitcnt vmcnt(0)
39038; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v32
39039; GFX8-NEXT:    v_cndmask_b32_e64 v28, v33, v28, s[66:67]
39040; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[64:65]
39041; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
39042; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
39043; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v32, s[62:63]
39044; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v31, s[60:61]
39045; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
39046; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
39047; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[58:59]
39048; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
39049; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
39050; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
39051; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[54:55]
39052; GFX8-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
39053; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v23
39054; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
39055; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[50:51]
39056; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
39057; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
39058; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
39059; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[46:47]
39060; GFX8-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
39061; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v19
39062; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
39063; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[42:43]
39064; GFX8-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
39065; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
39066; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
39067; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[38:39]
39068; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
39069; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
39070; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
39071; GFX8-NEXT:    v_cndmask_b32_e64 v17, v33, v17, s[34:35]
39072; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
39073; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
39074; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
39075; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[28:29]
39076; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
39077; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v11
39078; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
39079; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[24:25]
39080; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
39081; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
39082; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v8
39083; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[20:21]
39084; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
39085; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
39086; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
39087; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[16:17]
39088; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
39089; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
39090; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
39091; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[12:13]
39092; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
39093; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
39094; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
39095; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[8:9]
39096; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
39097; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
39098; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v0
39099; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[4:5]
39100; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
39101; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
39102; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39103; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
39104; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39105; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
39106; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
39107; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39108; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39109; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
39110; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v13
39111; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
39112; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v17
39113; GFX8-NEXT:    v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39114; GFX8-NEXT:    v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39115; GFX8-NEXT:    v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39116; GFX8-NEXT:    v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39117; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v19
39118; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v21
39119; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v23
39120; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v25
39121; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v27
39122; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
39123; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
39124; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
39125; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39126; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39127; GFX8-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39128; GFX8-NEXT:    v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39129; GFX8-NEXT:    v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39130; GFX8-NEXT:    v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39131; GFX8-NEXT:    v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39132; GFX8-NEXT:    v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39133; GFX8-NEXT:    v_readlane_b32 s67, v34, 35
39134; GFX8-NEXT:    v_readlane_b32 s66, v34, 34
39135; GFX8-NEXT:    v_readlane_b32 s65, v34, 33
39136; GFX8-NEXT:    v_readlane_b32 s64, v34, 32
39137; GFX8-NEXT:    v_readlane_b32 s63, v34, 31
39138; GFX8-NEXT:    v_readlane_b32 s62, v34, 30
39139; GFX8-NEXT:    v_readlane_b32 s61, v34, 29
39140; GFX8-NEXT:    v_readlane_b32 s60, v34, 28
39141; GFX8-NEXT:    v_readlane_b32 s59, v34, 27
39142; GFX8-NEXT:    v_readlane_b32 s58, v34, 26
39143; GFX8-NEXT:    v_readlane_b32 s57, v34, 25
39144; GFX8-NEXT:    v_readlane_b32 s56, v34, 24
39145; GFX8-NEXT:    v_readlane_b32 s55, v34, 23
39146; GFX8-NEXT:    v_readlane_b32 s54, v34, 22
39147; GFX8-NEXT:    v_readlane_b32 s53, v34, 21
39148; GFX8-NEXT:    v_readlane_b32 s52, v34, 20
39149; GFX8-NEXT:    v_readlane_b32 s51, v34, 19
39150; GFX8-NEXT:    v_readlane_b32 s50, v34, 18
39151; GFX8-NEXT:    v_readlane_b32 s49, v34, 17
39152; GFX8-NEXT:    v_readlane_b32 s48, v34, 16
39153; GFX8-NEXT:    v_readlane_b32 s47, v34, 15
39154; GFX8-NEXT:    v_readlane_b32 s46, v34, 14
39155; GFX8-NEXT:    v_readlane_b32 s45, v34, 13
39156; GFX8-NEXT:    v_readlane_b32 s44, v34, 12
39157; GFX8-NEXT:    v_readlane_b32 s43, v34, 11
39158; GFX8-NEXT:    v_readlane_b32 s42, v34, 10
39159; GFX8-NEXT:    v_readlane_b32 s41, v34, 9
39160; GFX8-NEXT:    v_readlane_b32 s40, v34, 8
39161; GFX8-NEXT:    v_readlane_b32 s39, v34, 7
39162; GFX8-NEXT:    v_readlane_b32 s38, v34, 6
39163; GFX8-NEXT:    v_readlane_b32 s37, v34, 5
39164; GFX8-NEXT:    v_readlane_b32 s36, v34, 4
39165; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
39166; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
39167; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
39168; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
39169; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
39170; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39171; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
39172; GFX8-NEXT:    s_waitcnt vmcnt(0)
39173; GFX8-NEXT:    s_setpc_b64 s[30:31]
39174;
39175; GFX9-LABEL: v_vselect_v32bf16:
39176; GFX9:       ; %bb.0:
39177; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39178; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
39179; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
39180; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
39181; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
39182; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
39183; GFX9-NEXT:    v_writelane_b32 v33, s34, 2
39184; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
39185; GFX9-NEXT:    v_writelane_b32 v33, s35, 3
39186; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
39187; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
39188; GFX9-NEXT:    v_writelane_b32 v33, s36, 4
39189; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
39190; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
39191; GFX9-NEXT:    v_writelane_b32 v33, s37, 5
39192; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
39193; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
39194; GFX9-NEXT:    v_writelane_b32 v33, s38, 6
39195; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
39196; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
39197; GFX9-NEXT:    v_writelane_b32 v33, s39, 7
39198; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
39199; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
39200; GFX9-NEXT:    v_writelane_b32 v33, s40, 8
39201; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
39202; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
39203; GFX9-NEXT:    v_writelane_b32 v33, s41, 9
39204; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
39205; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
39206; GFX9-NEXT:    v_writelane_b32 v33, s42, 10
39207; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
39208; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
39209; GFX9-NEXT:    v_writelane_b32 v33, s43, 11
39210; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
39211; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
39212; GFX9-NEXT:    v_writelane_b32 v33, s44, 12
39213; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
39214; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
39215; GFX9-NEXT:    v_writelane_b32 v33, s45, 13
39216; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
39217; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
39218; GFX9-NEXT:    v_writelane_b32 v33, s46, 14
39219; GFX9-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
39220; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
39221; GFX9-NEXT:    v_writelane_b32 v33, s47, 15
39222; GFX9-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
39223; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
39224; GFX9-NEXT:    v_writelane_b32 v33, s48, 16
39225; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
39226; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
39227; GFX9-NEXT:    v_writelane_b32 v33, s49, 17
39228; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
39229; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
39230; GFX9-NEXT:    v_writelane_b32 v33, s50, 18
39231; GFX9-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
39232; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
39233; GFX9-NEXT:    v_writelane_b32 v33, s51, 19
39234; GFX9-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
39235; GFX9-NEXT:    v_and_b32_e32 v0, 1, v19
39236; GFX9-NEXT:    v_writelane_b32 v33, s52, 20
39237; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
39238; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
39239; GFX9-NEXT:    v_writelane_b32 v33, s53, 21
39240; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
39241; GFX9-NEXT:    v_and_b32_e32 v0, 1, v21
39242; GFX9-NEXT:    v_writelane_b32 v33, s54, 22
39243; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
39244; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
39245; GFX9-NEXT:    v_writelane_b32 v33, s55, 23
39246; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
39247; GFX9-NEXT:    v_and_b32_e32 v0, 1, v23
39248; GFX9-NEXT:    v_writelane_b32 v33, s56, 24
39249; GFX9-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
39250; GFX9-NEXT:    v_and_b32_e32 v0, 1, v22
39251; GFX9-NEXT:    v_writelane_b32 v33, s57, 25
39252; GFX9-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
39253; GFX9-NEXT:    v_and_b32_e32 v0, 1, v25
39254; GFX9-NEXT:    v_writelane_b32 v33, s58, 26
39255; GFX9-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
39256; GFX9-NEXT:    v_and_b32_e32 v0, 1, v24
39257; GFX9-NEXT:    v_writelane_b32 v33, s59, 27
39258; GFX9-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
39259; GFX9-NEXT:    v_and_b32_e32 v0, 1, v27
39260; GFX9-NEXT:    v_writelane_b32 v33, s60, 28
39261; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
39262; GFX9-NEXT:    v_and_b32_e32 v0, 1, v26
39263; GFX9-NEXT:    v_writelane_b32 v33, s61, 29
39264; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
39265; GFX9-NEXT:    v_and_b32_e32 v0, 1, v29
39266; GFX9-NEXT:    v_writelane_b32 v33, s62, 30
39267; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
39268; GFX9-NEXT:    v_and_b32_e32 v0, 1, v28
39269; GFX9-NEXT:    v_writelane_b32 v33, s63, 31
39270; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
39271; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
39272; GFX9-NEXT:    v_writelane_b32 v33, s64, 32
39273; GFX9-NEXT:    v_writelane_b32 v33, s65, 33
39274; GFX9-NEXT:    v_writelane_b32 v33, s66, 34
39275; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
39276; GFX9-NEXT:    v_writelane_b32 v33, s67, 35
39277; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
39278; GFX9-NEXT:    s_waitcnt vmcnt(0)
39279; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
39280; GFX9-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
39281; GFX9-NEXT:    v_and_b32_e32 v0, 1, v30
39282; GFX9-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
39283; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
39284; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
39285; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
39286; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
39287; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
39288; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
39289; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
39290; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
39291; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
39292; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
39293; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
39294; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
39295; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
39296; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
39297; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
39298; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
39299; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
39300; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
39301; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
39302; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
39303; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
39304; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
39305; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
39306; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
39307; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
39308; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
39309; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
39310; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
39311; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
39312; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:60
39313; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
39314; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
39315; GFX9-NEXT:    s_waitcnt vmcnt(0)
39316; GFX9-NEXT:    v_cndmask_b32_e64 v29, v31, v32, s[66:67]
39317; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
39318; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
39319; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[64:65]
39320; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v30, s[62:63]
39321; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
39322; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
39323; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v30, s[60:61]
39324; GFX9-NEXT:    v_cndmask_b32_e64 v30, v26, v27, s[58:59]
39325; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
39326; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
39327; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
39328; GFX9-NEXT:    v_cndmask_b32_e64 v27, v24, v25, s[54:55]
39329; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
39330; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
39331; GFX9-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
39332; GFX9-NEXT:    v_cndmask_b32_e64 v25, v22, v23, s[50:51]
39333; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
39334; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
39335; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
39336; GFX9-NEXT:    v_cndmask_b32_e64 v23, v20, v21, s[46:47]
39337; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
39338; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
39339; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
39340; GFX9-NEXT:    v_cndmask_b32_e64 v21, v18, v19, s[42:43]
39341; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
39342; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
39343; GFX9-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
39344; GFX9-NEXT:    v_cndmask_b32_e64 v19, v16, v17, s[38:39]
39345; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
39346; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
39347; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
39348; GFX9-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[34:35]
39349; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
39350; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
39351; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
39352; GFX9-NEXT:    v_cndmask_b32_e64 v15, v12, v13, s[28:29]
39353; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
39354; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
39355; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
39356; GFX9-NEXT:    v_cndmask_b32_e64 v13, v10, v11, s[24:25]
39357; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
39358; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
39359; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
39360; GFX9-NEXT:    v_cndmask_b32_e64 v11, v8, v9, s[20:21]
39361; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
39362; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
39363; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
39364; GFX9-NEXT:    v_cndmask_b32_e64 v9, v6, v7, s[16:17]
39365; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
39366; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
39367; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
39368; GFX9-NEXT:    v_cndmask_b32_e64 v7, v4, v5, s[12:13]
39369; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
39370; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
39371; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
39372; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[8:9]
39373; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
39374; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
39375; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
39376; GFX9-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
39377; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
39378; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
39379; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
39380; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
39381; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
39382; GFX9-NEXT:    v_perm_b32 v1, v2, v5, s4
39383; GFX9-NEXT:    v_perm_b32 v2, v4, v7, s4
39384; GFX9-NEXT:    v_perm_b32 v3, v6, v9, s4
39385; GFX9-NEXT:    v_perm_b32 v4, v8, v11, s4
39386; GFX9-NEXT:    v_perm_b32 v5, v10, v13, s4
39387; GFX9-NEXT:    v_perm_b32 v6, v12, v15, s4
39388; GFX9-NEXT:    v_perm_b32 v7, v14, v17, s4
39389; GFX9-NEXT:    v_perm_b32 v8, v16, v19, s4
39390; GFX9-NEXT:    v_perm_b32 v9, v18, v21, s4
39391; GFX9-NEXT:    v_perm_b32 v10, v20, v23, s4
39392; GFX9-NEXT:    v_perm_b32 v11, v22, v25, s4
39393; GFX9-NEXT:    v_perm_b32 v12, v24, v27, s4
39394; GFX9-NEXT:    v_perm_b32 v13, v26, v30, s4
39395; GFX9-NEXT:    v_perm_b32 v14, v28, v32, s4
39396; GFX9-NEXT:    v_perm_b32 v15, v31, v29, s4
39397; GFX9-NEXT:    v_readlane_b32 s67, v33, 35
39398; GFX9-NEXT:    v_readlane_b32 s66, v33, 34
39399; GFX9-NEXT:    v_readlane_b32 s65, v33, 33
39400; GFX9-NEXT:    v_readlane_b32 s64, v33, 32
39401; GFX9-NEXT:    v_readlane_b32 s63, v33, 31
39402; GFX9-NEXT:    v_readlane_b32 s62, v33, 30
39403; GFX9-NEXT:    v_readlane_b32 s61, v33, 29
39404; GFX9-NEXT:    v_readlane_b32 s60, v33, 28
39405; GFX9-NEXT:    v_readlane_b32 s59, v33, 27
39406; GFX9-NEXT:    v_readlane_b32 s58, v33, 26
39407; GFX9-NEXT:    v_readlane_b32 s57, v33, 25
39408; GFX9-NEXT:    v_readlane_b32 s56, v33, 24
39409; GFX9-NEXT:    v_readlane_b32 s55, v33, 23
39410; GFX9-NEXT:    v_readlane_b32 s54, v33, 22
39411; GFX9-NEXT:    v_readlane_b32 s53, v33, 21
39412; GFX9-NEXT:    v_readlane_b32 s52, v33, 20
39413; GFX9-NEXT:    v_readlane_b32 s51, v33, 19
39414; GFX9-NEXT:    v_readlane_b32 s50, v33, 18
39415; GFX9-NEXT:    v_readlane_b32 s49, v33, 17
39416; GFX9-NEXT:    v_readlane_b32 s48, v33, 16
39417; GFX9-NEXT:    v_readlane_b32 s47, v33, 15
39418; GFX9-NEXT:    v_readlane_b32 s46, v33, 14
39419; GFX9-NEXT:    v_readlane_b32 s45, v33, 13
39420; GFX9-NEXT:    v_readlane_b32 s44, v33, 12
39421; GFX9-NEXT:    v_readlane_b32 s43, v33, 11
39422; GFX9-NEXT:    v_readlane_b32 s42, v33, 10
39423; GFX9-NEXT:    v_readlane_b32 s41, v33, 9
39424; GFX9-NEXT:    v_readlane_b32 s40, v33, 8
39425; GFX9-NEXT:    v_readlane_b32 s39, v33, 7
39426; GFX9-NEXT:    v_readlane_b32 s38, v33, 6
39427; GFX9-NEXT:    v_readlane_b32 s37, v33, 5
39428; GFX9-NEXT:    v_readlane_b32 s36, v33, 4
39429; GFX9-NEXT:    v_readlane_b32 s35, v33, 3
39430; GFX9-NEXT:    v_readlane_b32 s34, v33, 2
39431; GFX9-NEXT:    v_readlane_b32 s31, v33, 1
39432; GFX9-NEXT:    v_readlane_b32 s30, v33, 0
39433; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
39434; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39435; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
39436; GFX9-NEXT:    s_waitcnt vmcnt(0)
39437; GFX9-NEXT:    s_setpc_b64 s[30:31]
39438;
39439; GFX10-LABEL: v_vselect_v32bf16:
39440; GFX10:       ; %bb.0:
39441; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39442; GFX10-NEXT:    s_clause 0xa
39443; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
39444; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
39445; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
39446; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:104
39447; GFX10-NEXT:    buffer_load_ushort v35, off, s[0:3], s32
39448; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:128
39449; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:64
39450; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:96
39451; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:108
39452; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
39453; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
39454; GFX10-NEXT:    v_and_b32_e32 v30, 1, v30
39455; GFX10-NEXT:    v_and_b32_e32 v18, 1, v18
39456; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
39457; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
39458; GFX10-NEXT:    v_and_b32_e32 v19, 1, v19
39459; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
39460; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v18
39461; GFX10-NEXT:    v_and_b32_e32 v28, 1, v28
39462; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v13
39463; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v19
39464; GFX10-NEXT:    v_and_b32_e32 v26, 1, v26
39465; GFX10-NEXT:    v_and_b32_e32 v24, 1, v24
39466; GFX10-NEXT:    v_and_b32_e32 v22, 1, v22
39467; GFX10-NEXT:    v_and_b32_e32 v20, 1, v20
39468; GFX10-NEXT:    v_and_b32_e32 v21, 1, v21
39469; GFX10-NEXT:    v_and_b32_e32 v16, 1, v16
39470; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
39471; GFX10-NEXT:    v_and_b32_e32 v17, 1, v17
39472; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
39473; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
39474; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
39475; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
39476; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
39477; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
39478; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
39479; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
39480; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
39481; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
39482; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
39483; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
39484; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
39485; GFX10-NEXT:    s_waitcnt vmcnt(10)
39486; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v31
39487; GFX10-NEXT:    s_waitcnt vmcnt(9)
39488; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v32
39489; GFX10-NEXT:    s_waitcnt vmcnt(8)
39490; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v33
39491; GFX10-NEXT:    s_waitcnt vmcnt(7)
39492; GFX10-NEXT:    v_cndmask_b32_e64 v18, v34, v33, s6
39493; GFX10-NEXT:    s_waitcnt vmcnt(6)
39494; GFX10-NEXT:    v_and_b32_e32 v35, 1, v35
39495; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v12
39496; GFX10-NEXT:    s_waitcnt vmcnt(4)
39497; GFX10-NEXT:    v_cndmask_b32_e32 v54, v36, v37, vcc_lo
39498; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39499; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39500; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v35
39501; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v34
39502; GFX10-NEXT:    v_cndmask_b32_e64 v12, v32, v31, s6
39503; GFX10-NEXT:    s_clause 0x6
39504; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68
39505; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
39506; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
39507; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
39508; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:76
39509; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
39510; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:80
39511; GFX10-NEXT:    v_cndmask_b32_e64 v30, v50, v30, s4
39512; GFX10-NEXT:    v_cndmask_b32_e32 v35, v36, v37, vcc_lo
39513; GFX10-NEXT:    s_clause 0x1
39514; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:124
39515; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
39516; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
39517; GFX10-NEXT:    v_and_b32_e32 v28, 1, v29
39518; GFX10-NEXT:    v_cndmask_b32_e64 v13, v51, v13, s5
39519; GFX10-NEXT:    s_waitcnt vmcnt(3)
39520; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v52
39521; GFX10-NEXT:    s_waitcnt vmcnt(0)
39522; GFX10-NEXT:    v_cndmask_b32_e32 v29, v36, v37, vcc_lo
39523; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39524; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39525; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
39526; GFX10-NEXT:    v_cndmask_b32_e32 v28, v36, v37, vcc_lo
39527; GFX10-NEXT:    s_clause 0x1
39528; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:120
39529; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
39530; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
39531; GFX10-NEXT:    v_and_b32_e32 v26, 1, v27
39532; GFX10-NEXT:    s_waitcnt vmcnt(0)
39533; GFX10-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
39534; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39535; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39536; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
39537; GFX10-NEXT:    v_cndmask_b32_e32 v26, v36, v37, vcc_lo
39538; GFX10-NEXT:    s_clause 0x1
39539; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:116
39540; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:52
39541; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
39542; GFX10-NEXT:    v_and_b32_e32 v24, 1, v25
39543; GFX10-NEXT:    s_waitcnt vmcnt(0)
39544; GFX10-NEXT:    v_cndmask_b32_e32 v25, v36, v37, vcc_lo
39545; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39546; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39547; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
39548; GFX10-NEXT:    v_cndmask_b32_e32 v24, v36, v37, vcc_lo
39549; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48
39550; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
39551; GFX10-NEXT:    v_and_b32_e32 v22, 1, v23
39552; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v49
39553; GFX10-NEXT:    s_waitcnt vmcnt(0)
39554; GFX10-NEXT:    v_cndmask_b32_e32 v23, v49, v36, vcc_lo
39555; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39556; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
39557; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v53
39558; GFX10-NEXT:    v_cndmask_b32_e32 v22, v37, v36, vcc_lo
39559; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
39560; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v48
39561; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v39
39562; GFX10-NEXT:    v_cndmask_b32_e32 v20, v39, v48, vcc_lo
39563; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
39564; GFX10-NEXT:    s_clause 0x1
39565; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
39566; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:16
39567; GFX10-NEXT:    v_cndmask_b32_e32 v21, v37, v36, vcc_lo
39568; GFX10-NEXT:    s_clause 0x1
39569; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:100
39570; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
39571; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
39572; GFX10-NEXT:    s_waitcnt vmcnt(0)
39573; GFX10-NEXT:    v_cndmask_b32_e32 v16, v36, v37, vcc_lo
39574; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
39575; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39576; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39577; GFX10-NEXT:    v_cndmask_b32_e32 v14, v38, v39, vcc_lo
39578; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
39579; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
39580; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
39581; GFX10-NEXT:    v_cndmask_b32_e32 v17, v36, v37, vcc_lo
39582; GFX10-NEXT:    s_clause 0x1
39583; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:88
39584; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
39585; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
39586; GFX10-NEXT:    v_cndmask_b32_e32 v15, v38, v39, vcc_lo
39587; GFX10-NEXT:    s_clause 0x1
39588; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:84
39589; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20
39590; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
39591; GFX10-NEXT:    s_waitcnt vmcnt(2)
39592; GFX10-NEXT:    v_cndmask_b32_e32 v10, v36, v37, vcc_lo
39593; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
39594; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39595; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39596; GFX10-NEXT:    s_waitcnt vmcnt(0)
39597; GFX10-NEXT:    v_cndmask_b32_e32 v8, v38, v39, vcc_lo
39598; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
39599; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
39600; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
39601; GFX10-NEXT:    v_cndmask_b32_e32 v6, v53, v48, vcc_lo
39602; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
39603; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
39604; GFX10-NEXT:    v_cndmask_b32_e32 v4, v34, v52, vcc_lo
39605; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
39606; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
39607; GFX10-NEXT:    v_cndmask_b32_e32 v2, v32, v33, vcc_lo
39608; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
39609; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
39610; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
39611; GFX10-NEXT:    v_cndmask_b32_e32 v0, v19, v31, vcc_lo
39612; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
39613; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
39614; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
39615; GFX10-NEXT:    v_cndmask_b32_e32 v11, v36, v37, vcc_lo
39616; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
39617; GFX10-NEXT:    v_cndmask_b32_e32 v7, v49, v48, vcc_lo
39618; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
39619; GFX10-NEXT:    v_cndmask_b32_e32 v3, v32, v33, vcc_lo
39620; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
39621; GFX10-NEXT:    v_cndmask_b32_e32 v1, v19, v31, vcc_lo
39622; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
39623; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
39624; GFX10-NEXT:    v_cndmask_b32_e32 v5, v34, v50, vcc_lo
39625; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
39626; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
39627; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
39628; GFX10-NEXT:    v_perm_b32 v6, v30, v12, 0x5040100
39629; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
39630; GFX10-NEXT:    v_cndmask_b32_e32 v9, v38, v39, vcc_lo
39631; GFX10-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
39632; GFX10-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
39633; GFX10-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
39634; GFX10-NEXT:    v_perm_b32 v11, v22, v23, 0x5040100
39635; GFX10-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
39636; GFX10-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
39637; GFX10-NEXT:    v_perm_b32 v9, v13, v18, 0x5040100
39638; GFX10-NEXT:    v_perm_b32 v12, v24, v25, 0x5040100
39639; GFX10-NEXT:    v_perm_b32 v13, v26, v27, 0x5040100
39640; GFX10-NEXT:    v_perm_b32 v14, v28, v29, 0x5040100
39641; GFX10-NEXT:    v_perm_b32 v15, v35, v54, 0x5040100
39642; GFX10-NEXT:    s_setpc_b64 s[30:31]
39643;
39644; GFX11TRUE16-LABEL: v_vselect_v32bf16:
39645; GFX11TRUE16:       ; %bb.0:
39646; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39647; GFX11TRUE16-NEXT:    s_clause 0x1f
39648; GFX11TRUE16-NEXT:    scratch_load_u16 v31, off, s32
39649; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:128
39650; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:64
39651; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
39652; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:60
39653; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:120
39654; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:56
39655; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:116
39656; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:52
39657; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:112
39658; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:48
39659; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:108
39660; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:44
39661; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:104
39662; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:40
39663; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:100
39664; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:36
39665; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:96
39666; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:32
39667; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:92
39668; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:28
39669; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:88
39670; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:24
39671; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:84
39672; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:20
39673; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:80
39674; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:16
39675; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
39676; GFX11TRUE16-NEXT:    scratch_load_b32 v83, off, s32 offset:12
39677; GFX11TRUE16-NEXT:    scratch_load_b32 v84, off, s32 offset:72
39678; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
39679; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
39680; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
39681; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
39682; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
39683; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
39684; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
39685; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
39686; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
39687; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
39688; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
39689; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
39690; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
39691; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
39692; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
39693; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
39694; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
39695; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
39696; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
39697; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
39698; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
39699; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
39700; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
39701; GFX11TRUE16-NEXT:    v_and_b32_e32 v17, 1, v17
39702; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
39703; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
39704; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
39705; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
39706; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
39707; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
39708; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
39709; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
39710; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
39711; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
39712; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
39713; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s22, 1, v22
39714; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s24, 1, v24
39715; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s26, 1, v30
39716; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s27, 1, v26
39717; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s29, 1, v28
39718; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
39719; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
39720; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
39721; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
39722; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
39723; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
39724; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
39725; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
39726; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
39727; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v10
39728; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
39729; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v12
39730; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
39731; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
39732; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s15, 1, v17
39733; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s16, 1, v16
39734; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s17, 1, v19
39735; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s18, 1, v18
39736; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s19, 1, v21
39737; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s20, 1, v20
39738; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s21, 1, v23
39739; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s23, 1, v25
39740; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s25, 1, v27
39741; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s28, 1, v29
39742; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
39743; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(32)
39744; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v31
39745; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(31)
39746; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v32
39747; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(30)
39748; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v32.l, v33.l, s26
39749; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v33
39750; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(28)
39751; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v34.l, v35.l, s29
39752; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
39753; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v34
39754; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
39755; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v36.l, v37.l, s27
39756; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v37
39757; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v36
39758; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
39759; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v38.l, v39.l, s24
39760; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v39
39761; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v38
39762; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(22)
39763; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v48.l, v49.l, s22
39764; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
39765; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v48
39766; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(18)
39767; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v53
39768; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v52
39769; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(14)
39770; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v65
39771; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v64
39772; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v50.l, v51.l, s20
39773; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(11)
39774; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v68
39775; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(10)
39776; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v69
39777; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(9)
39778; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v70
39779; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(8)
39780; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v71
39781; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(7)
39782; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v80
39783; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(6)
39784; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v81
39785; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(5)
39786; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v82
39787; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(4)
39788; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v83
39789; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(3)
39790; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v84
39791; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(2)
39792; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v85
39793; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(1)
39794; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v86
39795; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
39796; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.h, v86.l, v87.l, s0
39797; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v87
39798; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v8
39799; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v51
39800; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v50
39801; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v52.l, v53.l, s18
39802; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v54.l, v55.l, s16
39803; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v55
39804; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v54
39805; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v64.l, v65.l, s14
39806; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.h, v66.l, v67.l, s12
39807; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v67
39808; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v66
39809; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v70.l, v71.l, s8
39810; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v82.l, v83.l, s4
39811; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.l, v10.l, v9.l, s28
39812; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.h, v12.l, v11.l, s25
39813; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.l, v14.l, v13.l, s23
39814; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.h, v18.l, v15.l, s21
39815; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.h, v22.l, v21.l, s17
39816; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.h, v26.l, v25.l, s13
39817; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v30.l, v29.l, s9
39818; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.l, v32.l, v31.l, s7
39819; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v34.l, v33.l, s5
39820; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.l, v36.l, v35.l, s3
39821; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v38.l, v37.l, s1
39822; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo
39823; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.h, v17.l, v16.l, s0
39824; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v68.l, v69.l, s10
39825; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v80.l, v81.l, s6
39826; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.l, v84.l, v85.l, s2
39827; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.l, v20.l, v19.l, s19
39828; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.l, v24.l, v23.l, s15
39829; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.l, v28.l, v27.l, s11
39830; GFX11TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.h
39831; GFX11TRUE16-NEXT:    v_mov_b16_e32 v19.l, v6.h
39832; GFX11TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.h
39833; GFX11TRUE16-NEXT:    v_mov_b16_e32 v21.l, v4.h
39834; GFX11TRUE16-NEXT:    v_mov_b16_e32 v22.l, v4.l
39835; GFX11TRUE16-NEXT:    v_mov_b16_e32 v23.l, v3.h
39836; GFX11TRUE16-NEXT:    v_mov_b16_e32 v24.l, v3.l
39837; GFX11TRUE16-NEXT:    v_mov_b16_e32 v25.l, v2.h
39838; GFX11TRUE16-NEXT:    v_mov_b16_e32 v26.l, v2.l
39839; GFX11TRUE16-NEXT:    v_mov_b16_e32 v27.l, v1.h
39840; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v1.l
39841; GFX11TRUE16-NEXT:    v_mov_b16_e32 v29.l, v0.h
39842; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v0.l
39843; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v15.l
39844; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v14.h
39845; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v14.l
39846; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v13.h
39847; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v13.l
39848; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.h
39849; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v11.h
39850; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.h
39851; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.h
39852; GFX11TRUE16-NEXT:    v_mov_b16_e32 v31.l, v9.l
39853; GFX11TRUE16-NEXT:    v_mov_b16_e32 v32.l, v8.h
39854; GFX11TRUE16-NEXT:    v_mov_b16_e32 v33.l, v8.l
39855; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.l, v15.h
39856; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v18, 0x5040100
39857; GFX11TRUE16-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
39858; GFX11TRUE16-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
39859; GFX11TRUE16-NEXT:    v_perm_b32 v3, v3, v6, 0x5040100
39860; GFX11TRUE16-NEXT:    v_perm_b32 v4, v4, v20, 0x5040100
39861; GFX11TRUE16-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
39862; GFX11TRUE16-NEXT:    v_perm_b32 v6, v12, v21, 0x5040100
39863; GFX11TRUE16-NEXT:    v_perm_b32 v7, v14, v22, 0x5040100
39864; GFX11TRUE16-NEXT:    v_perm_b32 v8, v11, v23, 0x5040100
39865; GFX11TRUE16-NEXT:    v_perm_b32 v9, v16, v24, 0x5040100
39866; GFX11TRUE16-NEXT:    v_perm_b32 v10, v10, v25, 0x5040100
39867; GFX11TRUE16-NEXT:    v_perm_b32 v11, v17, v26, 0x5040100
39868; GFX11TRUE16-NEXT:    v_perm_b32 v12, v31, v27, 0x5040100
39869; GFX11TRUE16-NEXT:    v_perm_b32 v13, v32, v28, 0x5040100
39870; GFX11TRUE16-NEXT:    v_perm_b32 v14, v33, v29, 0x5040100
39871; GFX11TRUE16-NEXT:    v_perm_b32 v15, v15, v30, 0x5040100
39872; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
39873;
39874; GFX11FAKE16-LABEL: v_vselect_v32bf16:
39875; GFX11FAKE16:       ; %bb.0:
39876; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39877; GFX11FAKE16-NEXT:    s_clause 0x1f
39878; GFX11FAKE16-NEXT:    scratch_load_u16 v31, off, s32
39879; GFX11FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:128
39880; GFX11FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:64
39881; GFX11FAKE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
39882; GFX11FAKE16-NEXT:    scratch_load_b32 v35, off, s32 offset:60
39883; GFX11FAKE16-NEXT:    scratch_load_b32 v36, off, s32 offset:120
39884; GFX11FAKE16-NEXT:    scratch_load_b32 v37, off, s32 offset:56
39885; GFX11FAKE16-NEXT:    scratch_load_b32 v38, off, s32 offset:116
39886; GFX11FAKE16-NEXT:    scratch_load_b32 v39, off, s32 offset:52
39887; GFX11FAKE16-NEXT:    scratch_load_b32 v48, off, s32 offset:112
39888; GFX11FAKE16-NEXT:    scratch_load_b32 v49, off, s32 offset:48
39889; GFX11FAKE16-NEXT:    scratch_load_b32 v50, off, s32 offset:108
39890; GFX11FAKE16-NEXT:    scratch_load_b32 v51, off, s32 offset:44
39891; GFX11FAKE16-NEXT:    scratch_load_b32 v52, off, s32 offset:104
39892; GFX11FAKE16-NEXT:    scratch_load_b32 v53, off, s32 offset:40
39893; GFX11FAKE16-NEXT:    scratch_load_b32 v54, off, s32 offset:100
39894; GFX11FAKE16-NEXT:    scratch_load_b32 v55, off, s32 offset:36
39895; GFX11FAKE16-NEXT:    scratch_load_b32 v64, off, s32 offset:96
39896; GFX11FAKE16-NEXT:    scratch_load_b32 v65, off, s32 offset:32
39897; GFX11FAKE16-NEXT:    scratch_load_b32 v66, off, s32 offset:92
39898; GFX11FAKE16-NEXT:    scratch_load_b32 v67, off, s32 offset:28
39899; GFX11FAKE16-NEXT:    scratch_load_b32 v68, off, s32 offset:88
39900; GFX11FAKE16-NEXT:    scratch_load_b32 v69, off, s32 offset:24
39901; GFX11FAKE16-NEXT:    scratch_load_b32 v70, off, s32 offset:84
39902; GFX11FAKE16-NEXT:    scratch_load_b32 v71, off, s32 offset:20
39903; GFX11FAKE16-NEXT:    scratch_load_b32 v80, off, s32 offset:80
39904; GFX11FAKE16-NEXT:    scratch_load_b32 v81, off, s32 offset:16
39905; GFX11FAKE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
39906; GFX11FAKE16-NEXT:    scratch_load_b32 v83, off, s32 offset:12
39907; GFX11FAKE16-NEXT:    scratch_load_b32 v84, off, s32 offset:72
39908; GFX11FAKE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
39909; GFX11FAKE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
39910; GFX11FAKE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
39911; GFX11FAKE16-NEXT:    v_and_b32_e32 v30, 1, v30
39912; GFX11FAKE16-NEXT:    v_and_b32_e32 v28, 1, v28
39913; GFX11FAKE16-NEXT:    v_and_b32_e32 v26, 1, v26
39914; GFX11FAKE16-NEXT:    v_and_b32_e32 v24, 1, v24
39915; GFX11FAKE16-NEXT:    v_and_b32_e32 v22, 1, v22
39916; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
39917; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
39918; GFX11FAKE16-NEXT:    v_and_b32_e32 v20, 1, v20
39919; GFX11FAKE16-NEXT:    v_and_b32_e32 v18, 1, v18
39920; GFX11FAKE16-NEXT:    v_and_b32_e32 v16, 1, v16
39921; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(30)
39922; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
39923; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
39924; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
39925; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
39926; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
39927; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
39928; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(28)
39929; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v35, vcc_lo
39930; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
39931; GFX11FAKE16-NEXT:    v_and_b32_e32 v7, 1, v7
39932; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
39933; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
39934; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
39935; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(26)
39936; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v26, v36, v37, vcc_lo
39937; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
39938; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
39939; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
39940; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
39941; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
39942; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(24)
39943; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v24, v38, v39, vcc_lo
39944; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
39945; GFX11FAKE16-NEXT:    v_and_b32_e32 v11, 1, v11
39946; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
39947; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
39948; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
39949; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(22)
39950; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v22, v48, v49, vcc_lo
39951; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
39952; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
39953; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
39954; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
39955; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
39956; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(20)
39957; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v20, v50, v51, vcc_lo
39958; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
39959; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
39960; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
39961; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
39962; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
39963; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(18)
39964; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v18, v52, v53, vcc_lo
39965; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
39966; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
39967; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
39968; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
39969; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
39970; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(16)
39971; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v16, v54, v55, vcc_lo
39972; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
39973; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
39974; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
39975; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
39976; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
39977; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(14)
39978; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
39979; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
39980; GFX11FAKE16-NEXT:    v_and_b32_e32 v17, 1, v17
39981; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
39982; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
39983; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(12)
39984; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v12, v66, v67, vcc_lo
39985; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
39986; GFX11FAKE16-NEXT:    v_and_b32_e32 v23, 1, v23
39987; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
39988; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
39989; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(10)
39990; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v10, v68, v69, vcc_lo
39991; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
39992; GFX11FAKE16-NEXT:    v_and_b32_e32 v21, 1, v21
39993; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
39994; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
39995; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(8)
39996; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v8, v70, v71, vcc_lo
39997; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
39998; GFX11FAKE16-NEXT:    v_and_b32_e32 v27, 1, v27
39999; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
40000; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
40001; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(6)
40002; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v6, v80, v81, vcc_lo
40003; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
40004; GFX11FAKE16-NEXT:    v_and_b32_e32 v25, 1, v25
40005; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
40006; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
40007; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(4)
40008; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v4, v82, v83, vcc_lo
40009; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
40010; GFX11FAKE16-NEXT:    v_and_b32_e32 v31, 1, v31
40011; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
40012; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
40013; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(2)
40014; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v84, v85, vcc_lo
40015; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
40016; GFX11FAKE16-NEXT:    v_and_b32_e32 v29, 1, v29
40017; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
40018; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
40019; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
40020; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v86, v87, vcc_lo
40021; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
40022; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
40023; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
40024; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
40025; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
40026; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
40027; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
40028; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
40029; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
40030; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
40031; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
40032; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
40033; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
40034; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
40035; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
40036; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
40037; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
40038; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
40039; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
40040; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
40041; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
40042; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
40043; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
40044; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
40045; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
40046; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
40047; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
40048; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v84, v85, vcc_lo
40049; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
40050; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v86, v87, vcc_lo
40051; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
40052; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
40053; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
40054; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
40055; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
40056; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
40057; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
40058; GFX11FAKE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
40059; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
40060; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
40061; GFX11FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
40062; GFX11FAKE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
40063; GFX11FAKE16-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
40064; GFX11FAKE16-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
40065; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
40066; GFX11FAKE16-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
40067; GFX11FAKE16-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
40068; GFX11FAKE16-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
40069; GFX11FAKE16-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
40070; GFX11FAKE16-NEXT:    v_perm_b32 v14, v29, v28, 0x5040100
40071; GFX11FAKE16-NEXT:    v_perm_b32 v15, v31, v30, 0x5040100
40072; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
40073  %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
40074  ret <32 x bfloat> %op
40075}
40076
; Declarations of the llvm.fma intrinsic for bfloat: the scalar form and the
; 2-, 3- and 4-element vector forms. They are referenced by the v_fma_* test
; functions in this file.
40077declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
40078declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
40079declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
40080declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
40081
; Scalar bfloat FMA test: passes %a, %b and %c to @llvm.fma.bf16 and returns
; the result. The check lines are the expected llc output for each prefix
; named on the RUN lines at the top of the file; they are autogenerated
; (utils/update_llc_test_checks.py), so regenerate them instead of hand-editing.
;
; What the expected output records, per target group:
;  - GCN and GFX7 multiply each operand by 1.0, mask each operand with
;    0xffff0000, issue one v_fma_f32, and mask the result with 0xffff0000.
;  - GFX8, GFX9, GFX10 and GFX11 shift each operand left by 16, issue one f32
;    fma (v_fma_f32 or v_fmac_f32), then form the 16-bit result with
;    bfe(bit 16) + add 0x7fff, an alternative value "or 0x400000" selected when
;    v_cmp_u (unordered self-compare) is true, and a final shift right by 16.
;    NOTE(review) this looks like round-to-nearest-even with NaN quieting --
;    confirm against the bf16 lowering code.
;  - Both gfx1100 runs (real-true16 on and off) are covered by the shared
;    GFX11 prefix for this function.
40082define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
40083; GCN-LABEL: v_fma_bf16:
40084; GCN:       ; %bb.0:
40085; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40086; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40087; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40088; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40089; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40090; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40091; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40092; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
40093; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40094; GCN-NEXT:    s_setpc_b64 s[30:31]
40095;
40096; GFX7-LABEL: v_fma_bf16:
40097; GFX7:       ; %bb.0:
40098; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40099; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40100; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40101; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40102; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40103; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40104; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40105; GFX7-NEXT:    v_fma_f32 v0, v0, v1, v2
40106; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40107; GFX7-NEXT:    s_setpc_b64 s[30:31]
40108;
40109; GFX8-LABEL: v_fma_bf16:
40110; GFX8:       ; %bb.0:
40111; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40112; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
40113; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40114; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40115; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
40116; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
40117; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
40118; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
40119; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40120; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40121; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
40122; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40123; GFX8-NEXT:    s_setpc_b64 s[30:31]
40124;
40125; GFX9-LABEL: v_fma_bf16:
40126; GFX9:       ; %bb.0:
40127; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40128; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
40129; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40130; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40131; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
40132; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
40133; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
40134; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
40135; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40136; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40137; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
40138; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40139; GFX9-NEXT:    s_setpc_b64 s[30:31]
40140;
40141; GFX10-LABEL: v_fma_bf16:
40142; GFX10:       ; %bb.0:
40143; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40144; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
40145; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40146; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40147; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
40148; GFX10-NEXT:    v_bfe_u32 v0, v2, 16, 1
40149; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v2
40150; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
40151; GFX10-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
40152; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
40153; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40154; GFX10-NEXT:    s_setpc_b64 s[30:31]
40155;
40156; GFX11-LABEL: v_fma_bf16:
40157; GFX11:       ; %bb.0:
40158; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40159; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
40160; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40161; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40162; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40163; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
40164; GFX11-NEXT:    v_bfe_u32 v0, v2, 16, 1
40165; GFX11-NEXT:    v_or_b32_e32 v1, 0x400000, v2
40166; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
40167; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
40168; GFX11-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
40169; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
40170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
40171; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40172; GFX11-NEXT:    s_setpc_b64 s[30:31]
40173  %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
40174  ret bfloat %op
40175}
40176
; Two-element bfloat vector FMA test: passes %a, %b and %c to @llvm.fma.v2bf16
; and returns the result. The check lines are the autogenerated expected llc
; output per prefix; regenerate them with utils/update_llc_test_checks.py
; instead of editing by hand.
;
; What the expected output records, per target group:
;  - GCN and GFX7 take the six elements in v0..v5 (a in v0/v1, b in v2/v3,
;    c in v4/v5 as used by the two v_fma_f32 instructions), mask every operand
;    and both results with 0xffff0000, and return the results in v0 and v1.
;  - GFX8 and newer take one 32-bit register per vector (v0, v1, v2). The low
;    element is extracted with a left shift by 16 and the high element with an
;    and-mask of 0xffff0000; each pair goes through its own f32 fma followed by
;    the same bfe / add 0x7fff / or 0x400000 / v_cmp_u / cndmask sequence used
;    in the scalar test. The two results are then combined into v0 with
;    v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302 (GFX9, GFX10,
;    GFX11).
;  - Both gfx1100 runs (real-true16 on and off) are covered by the shared
;    GFX11 prefix for this function.
40177define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
40178; GCN-LABEL: v_fma_v2bf16:
40179; GCN:       ; %bb.0:
40180; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40181; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40182; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40183; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40184; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40185; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40186; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40187; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40188; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40189; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40190; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40191; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40192; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40193; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
40194; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
40195; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40196; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40197; GCN-NEXT:    s_setpc_b64 s[30:31]
40198;
40199; GFX7-LABEL: v_fma_v2bf16:
40200; GFX7:       ; %bb.0:
40201; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40202; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40203; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40204; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40205; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40206; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40207; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40208; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40209; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40210; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40211; GFX7-NEXT:    v_fma_f32 v1, v1, v3, v5
40212; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
40213; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40214; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40215; GFX7-NEXT:    v_fma_f32 v0, v0, v2, v3
40216; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40217; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40218; GFX7-NEXT:    s_setpc_b64 s[30:31]
40219;
40220; GFX8-LABEL: v_fma_v2bf16:
40221; GFX8:       ; %bb.0:
40222; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40223; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
40224; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
40225; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
40226; GFX8-NEXT:    v_fma_f32 v3, v5, v4, v3
40227; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
40228; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
40229; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40230; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40231; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40232; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
40233; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
40234; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
40235; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
40236; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
40237; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
40238; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
40239; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
40240; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40241; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40242; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
40243; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40244; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
40245; GFX8-NEXT:    s_setpc_b64 s[30:31]
40246;
40247; GFX9-LABEL: v_fma_v2bf16:
40248; GFX9:       ; %bb.0:
40249; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40250; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
40251; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
40252; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
40253; GFX9-NEXT:    v_fma_f32 v3, v5, v4, v3
40254; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40255; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40256; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40257; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
40258; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
40259; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
40260; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
40261; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
40262; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
40263; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
40264; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
40265; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
40266; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40267; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40268; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
40269; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
40270; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
40271; GFX9-NEXT:    s_setpc_b64 s[30:31]
40272;
40273; GFX10-LABEL: v_fma_v2bf16:
40274; GFX10:       ; %bb.0:
40275; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40276; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
40277; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
40278; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
40279; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40280; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40281; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40282; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v4
40283; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
40284; GFX10-NEXT:    v_bfe_u32 v0, v3, 16, 1
40285; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v3
40286; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
40287; GFX10-NEXT:    v_bfe_u32 v1, v2, 16, 1
40288; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v2
40289; GFX10-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
40290; GFX10-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
40291; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
40292; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
40293; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
40294; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
40295; GFX10-NEXT:    s_setpc_b64 s[30:31]
40296;
40297; GFX11-LABEL: v_fma_v2bf16:
40298; GFX11:       ; %bb.0:
40299; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40300; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
40301; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
40302; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40303; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40304; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
40305; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40306; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40307; GFX11-NEXT:    v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
40308; GFX11-NEXT:    v_bfe_u32 v1, v2, 16, 1
40309; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
40310; GFX11-NEXT:    v_bfe_u32 v0, v3, 16, 1
40311; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
40312; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
40313; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
40314; GFX11-NEXT:    v_add3_u32 v1, v1, v2, 0x7fff
40315; GFX11-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
40316; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
40317; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
40318; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
40319; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
40320; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
40321; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
40322; GFX11-NEXT:    s_setpc_b64 s[30:31]
40323  %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
40324  ret <2 x bfloat> %op
40325}
40326
40327define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
40328; GCN-LABEL: v_fma_v3bf16:
40329; GCN:       ; %bb.0:
40330; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40331; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40332; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40333; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
40334; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40335; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40336; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
40337; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40338; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40339; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
40340; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
40341; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40342; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40343; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
40344; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40345; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40346; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
40347; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40348; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40349; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
40350; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
40351; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
40352; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40353; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40354; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40355; GCN-NEXT:    s_setpc_b64 s[30:31]
40356;
40357; GFX7-LABEL: v_fma_v3bf16:
40358; GFX7:       ; %bb.0:
40359; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40360; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40361; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40362; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
40363; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40364; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40365; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
40366; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
40367; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40368; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40369; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40370; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40371; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
40372; GFX7-NEXT:    v_fma_f32 v2, v2, v5, v8
40373; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
40374; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40375; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40376; GFX7-NEXT:    v_fma_f32 v1, v1, v4, v5
40377; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
40378; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40379; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40380; GFX7-NEXT:    v_fma_f32 v0, v0, v3, v4
40381; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40382; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40383; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40384; GFX7-NEXT:    s_setpc_b64 s[30:31]
40385;
40386; GFX8-LABEL: v_fma_v3bf16:
40387; GFX8:       ; %bb.0:
40388; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40389; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
40390; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
40391; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40392; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
40393; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
40394; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
40395; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
40396; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
40397; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
40398; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
40399; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
40400; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
40401; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
40402; GFX8-NEXT:    v_fma_f32 v3, v6, v5, v3
40403; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
40404; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
40405; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
40406; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40407; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40408; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40409; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
40410; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
40411; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
40412; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
40413; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
40414; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
40415; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
40416; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
40417; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
40418; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40419; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
40420; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40421; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
40422; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
40423; GFX8-NEXT:    s_setpc_b64 s[30:31]
40424;
40425; GFX9-LABEL: v_fma_v3bf16:
40426; GFX9:       ; %bb.0:
40427; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40428; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
40429; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
40430; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40431; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
40432; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
40433; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
40434; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
40435; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
40436; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
40437; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
40438; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
40439; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
40440; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
40441; GFX9-NEXT:    v_fma_f32 v3, v6, v5, v3
40442; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40443; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40444; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40445; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
40446; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
40447; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
40448; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
40449; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
40450; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
40451; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
40452; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
40453; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
40454; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40455; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
40456; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
40457; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
40458; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
40459; GFX9-NEXT:    s_setpc_b64 s[30:31]
40460;
40461; GFX10-LABEL: v_fma_v3bf16:
40462; GFX10:       ; %bb.0:
40463; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40464; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
40465; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
40466; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
40467; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
40468; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
40469; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40470; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40471; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40472; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40473; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
40474; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
40475; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
40476; GFX10-NEXT:    v_bfe_u32 v1, v6, 16, 1
40477; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v6
40478; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
40479; GFX10-NEXT:    v_bfe_u32 v0, v5, 16, 1
40480; GFX10-NEXT:    v_bfe_u32 v2, v4, 16, 1
40481; GFX10-NEXT:    v_add3_u32 v1, v1, v6, 0x7fff
40482; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v4
40483; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
40484; GFX10-NEXT:    v_add3_u32 v0, v0, v5, 0x7fff
40485; GFX10-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
40486; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40487; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
40488; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40489; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
40490; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40491; GFX10-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
40492; GFX10-NEXT:    v_alignbit_b32 v1, s4, v3, 16
40493; GFX10-NEXT:    s_setpc_b64 s[30:31]
40494;
40495; GFX11TRUE16-LABEL: v_fma_v3bf16:
40496; GFX11TRUE16:       ; %bb.0:
40497; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40498; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
40499; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
40500; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
40501; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40502; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40503; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40504; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
40505; GFX11TRUE16-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
40506; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
40507; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v4, v0, v2
40508; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
40509; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
40510; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
40511; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
40512; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
40513; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40514; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40515; GFX11TRUE16-NEXT:    v_fmac_f32_e32 v5, v1, v3
40516; GFX11TRUE16-NEXT:    v_bfe_u32 v1, v6, 16, 1
40517; GFX11TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v6
40518; GFX11TRUE16-NEXT:    v_bfe_u32 v0, v5, 16, 1
40519; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
40520; GFX11TRUE16-NEXT:    v_add3_u32 v1, v1, v6, 0x7fff
40521; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v5
40522; GFX11TRUE16-NEXT:    v_add3_u32 v0, v0, v5, 0x7fff
40523; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40524; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40525; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
40526; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40527; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
40528; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40529; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
40530; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
40531; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v3, 16
40532; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
40533;
40534; GFX11FAKE16-LABEL: v_fma_v3bf16:
40535; GFX11FAKE16:       ; %bb.0:
40536; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40537; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
40538; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
40539; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
40540; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40541; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40542; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40543; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
40544; GFX11FAKE16-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
40545; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
40546; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v4, v0, v2
40547; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
40548; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
40549; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
40550; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
40551; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v4, 0x7fff
40552; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40553; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40554; GFX11FAKE16-NEXT:    v_fmac_f32_e32 v5, v1, v3
40555; GFX11FAKE16-NEXT:    v_bfe_u32 v1, v6, 16, 1
40556; GFX11FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v6
40557; GFX11FAKE16-NEXT:    v_bfe_u32 v0, v5, 16, 1
40558; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
40559; GFX11FAKE16-NEXT:    v_add3_u32 v1, v1, v6, 0x7fff
40560; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v5
40561; GFX11FAKE16-NEXT:    v_add3_u32 v0, v0, v5, 0x7fff
40562; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40563; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40564; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
40565; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40566; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
40567; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40568; GFX11FAKE16-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
40569; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
40570; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v3, 16
40571; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
40572  %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
40573  ret <3 x bfloat> %op
40574}
40575
; v_fma_v4bf16: passes three <4 x bfloat> arguments to @llvm.fma.v4bf16 and
; returns the result. The prefixed comment lines inside the function are the
; autogenerated expected llc output for each RUN prefix (GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11); they must match the compiler output exactly, so regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
; Both gfx1100 RUN lines share the single GFX11 prefix for this function.
40576define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
40577; GCN-LABEL: v_fma_v4bf16:
40578; GCN:       ; %bb.0:
40579; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40580; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40581; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40582; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
40583; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40584; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40585; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
40586; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40587; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
40588; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
40589; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40590; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
40591; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
40592; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
40593; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
40594; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40595; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
40596; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
40597; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40598; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
40599; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40600; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40601; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
40602; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40603; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40604; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
40605; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
40606; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
40607; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
40608; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40609; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40610; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40611; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40612; GCN-NEXT:    s_setpc_b64 s[30:31]
40613;
40614; GFX7-LABEL: v_fma_v4bf16:
40615; GFX7:       ; %bb.0:
40616; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40617; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40618; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
40619; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
40620; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40621; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
40622; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
40623; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
40624; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
40625; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40626; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40627; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40628; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
40629; GFX7-NEXT:    v_fma_f32 v3, v3, v7, v11
40630; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
40631; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
40632; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40633; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40634; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40635; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
40636; GFX7-NEXT:    v_fma_f32 v2, v2, v6, v7
40637; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v9
40638; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40639; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40640; GFX7-NEXT:    v_fma_f32 v1, v1, v5, v6
40641; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
40642; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40643; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40644; GFX7-NEXT:    v_fma_f32 v0, v0, v4, v5
40645; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40646; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40647; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40648; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40649; GFX7-NEXT:    s_setpc_b64 s[30:31]
40650;
40651; GFX8-LABEL: v_fma_v4bf16:
40652; GFX8:       ; %bb.0:
40653; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40654; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
40655; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
40656; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
40657; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
40658; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
40659; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
40660; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40661; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40662; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40663; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
40664; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
40665; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v6
40666; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
40667; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
40668; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
40669; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
40670; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
40671; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
40672; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
40673; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
40674; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
40675; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
40676; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
40677; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
40678; GFX8-NEXT:    v_fma_f32 v3, v7, v5, v3
40679; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
40680; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
40681; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40682; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40683; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40684; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
40685; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
40686; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
40687; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
40688; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
40689; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
40690; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
40691; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
40692; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
40693; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40694; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
40695; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
40696; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40697; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
40698; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
40699; GFX8-NEXT:    s_setpc_b64 s[30:31]
40700;
40701; GFX9-LABEL: v_fma_v4bf16:
40702; GFX9:       ; %bb.0:
40703; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40704; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
40705; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
40706; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
40707; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
40708; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40709; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40710; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40711; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
40712; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
40713; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
40714; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
40715; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
40716; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
40717; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
40718; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
40719; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
40720; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
40721; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
40722; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
40723; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
40724; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
40725; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
40726; GFX9-NEXT:    v_fma_f32 v3, v7, v5, v3
40727; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40728; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40729; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40730; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
40731; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
40732; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
40733; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
40734; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
40735; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
40736; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
40737; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
40738; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
40739; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40740; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
40741; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
40742; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
40743; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
40744; GFX9-NEXT:    s_setpc_b64 s[30:31]
40745;
40746; GFX10-LABEL: v_fma_v4bf16:
40747; GFX10:       ; %bb.0:
40748; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40749; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
40750; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
40751; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
40752; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40753; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40754; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40755; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
40756; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
40757; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
40758; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
40759; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40760; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40761; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40762; GFX10-NEXT:    v_bfe_u32 v10, v6, 16, 1
40763; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
40764; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v8
40765; GFX10-NEXT:    v_or_b32_e32 v1, 0x400000, v6
40766; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
40767; GFX10-NEXT:    v_add3_u32 v0, v10, v6, 0x7fff
40768; GFX10-NEXT:    v_bfe_u32 v2, v5, 16, 1
40769; GFX10-NEXT:    v_bfe_u32 v3, v7, 16, 1
40770; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
40771; GFX10-NEXT:    v_bfe_u32 v8, v4, 16, 1
40772; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
40773; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
40774; GFX10-NEXT:    v_add3_u32 v0, v2, v5, 0x7fff
40775; GFX10-NEXT:    v_add3_u32 v2, v3, v7, 0x7fff
40776; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v7
40777; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
40778; GFX10-NEXT:    v_add3_u32 v6, v8, v4, 0x7fff
40779; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v4
40780; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
40781; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
40782; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
40783; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
40784; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v9, vcc_lo
40785; GFX10-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
40786; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
40787; GFX10-NEXT:    s_setpc_b64 s[30:31]
40788;
40789; GFX11-LABEL: v_fma_v4bf16:
40790; GFX11:       ; %bb.0:
40791; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40792; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
40793; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40794; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
40795; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40796; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
40797; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40798; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
40799; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40800; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
40801; GFX11-NEXT:    v_fmac_f32_e32 v5, v1, v3
40802; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
40803; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40804; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
40805; GFX11-NEXT:    v_bfe_u32 v10, v6, 16, 1
40806; GFX11-NEXT:    v_or_b32_e32 v1, 0x400000, v6
40807; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
40808; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
40809; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40810; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
40811; GFX11-NEXT:    v_add3_u32 v0, v10, v6, 0x7fff
40812; GFX11-NEXT:    v_bfe_u32 v2, v5, 16, 1
40813; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
40814; GFX11-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
40815; GFX11-NEXT:    v_fmac_f32_e32 v7, v9, v8
40816; GFX11-NEXT:    v_bfe_u32 v8, v4, 16, 1
40817; GFX11-NEXT:    v_add3_u32 v0, v2, v5, 0x7fff
40818; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
40819; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
40820; GFX11-NEXT:    v_bfe_u32 v3, v7, 16, 1
40821; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
40822; GFX11-NEXT:    v_add3_u32 v6, v8, v4, 0x7fff
40823; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
40824; GFX11-NEXT:    v_add3_u32 v2, v3, v7, 0x7fff
40825; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v7
40826; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40827; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
40828; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
40829; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
40830; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
40831; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v9, vcc_lo
40832; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
40833; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
40834; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
40835; GFX11-NEXT:    s_setpc_b64 s[30:31]
40836  %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
40837  ret <4 x bfloat> %op
40838}
40839
; Declarations of the llvm.fmuladd intrinsic for a scalar bfloat and for 2-, 3-
; and 4-element bfloat vectors; the v_fmuladd_* test functions below call them.
40840declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
40841declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
40842declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
40843declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
40844
; v_fmuladd_bf16: passes three scalar bfloat arguments to @llvm.fmuladd.bf16
; and returns the result. The prefixed comment lines inside the function are
; the autogenerated expected llc output for each RUN prefix; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
; In every prefix the expected code is a separate v_mul_f32 followed by a
; v_add_f32 (no fused fma instruction), with the product masked by 0xffff0000
; before the add.
40845define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
40846; GCN-LABEL: v_fmuladd_bf16:
40847; GCN:       ; %bb.0:
40848; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40849; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40850; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40851; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40852; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40853; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40854; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
40855; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40856; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
40857; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
40858; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40859; GCN-NEXT:    s_setpc_b64 s[30:31]
40860;
40861; GFX7-LABEL: v_fmuladd_bf16:
40862; GFX7:       ; %bb.0:
40863; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40864; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40865; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40866; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40867; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40868; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40869; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
40870; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40871; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
40872; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
40873; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40874; GFX7-NEXT:    s_setpc_b64 s[30:31]
40875;
40876; GFX8-LABEL: v_fmuladd_bf16:
40877; GFX8:       ; %bb.0:
40878; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40879; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40880; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40881; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
40882; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
40883; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
40884; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
40885; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
40886; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40887; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
40888; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40889; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
40890; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
40891; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
40892; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
40893; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
40894; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40895; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40896; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
40897; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40898; GFX8-NEXT:    s_setpc_b64 s[30:31]
40899;
40900; GFX9-LABEL: v_fmuladd_bf16:
40901; GFX9:       ; %bb.0:
40902; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40903; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40904; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40905; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
40906; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
40907; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
40908; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
40909; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
40910; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40911; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
40912; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40913; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
40914; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
40915; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
40916; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
40917; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40918; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
40919; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
40920; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40921; GFX9-NEXT:    s_setpc_b64 s[30:31]
40922;
40923; GFX10-LABEL: v_fmuladd_bf16:
40924; GFX10:       ; %bb.0:
40925; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40926; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40927; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40928; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
40929; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
40930; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
40931; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
40932; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
40933; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
40934; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
40935; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40936; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
40937; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
40938; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40939; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
40940; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
40941; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
40942; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40943; GFX10-NEXT:    s_setpc_b64 s[30:31]
40944;
40945; GFX11-LABEL: v_fmuladd_bf16:
40946; GFX11:       ; %bb.0:
40947; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40948; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
40949; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
40950; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40951; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
40952; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
40953; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
40954; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
40955; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
40956; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
40957; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
40958; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40959; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40960; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
40961; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40962; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
40963; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
40964; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
40965; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
40966; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40967; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
40968; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
40969; GFX11-NEXT:    s_setpc_b64 s[30:31]
40970  %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
40971  ret bfloat %op
40972}
40973
; v_fmuladd_v2bf16: passes three <2 x bfloat> arguments to
; @llvm.fmuladd.v2bf16 and returns the result. The prefixed comment lines
; inside the function are the autogenerated expected llc output for each RUN
; prefix; regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; In every prefix the expected code multiplies and adds each element with
; separate v_mul_f32 / v_add_f32 instructions (no fused fma instruction).
40974define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
40975; GCN-LABEL: v_fmuladd_v2bf16:
40976; GCN:       ; %bb.0:
40977; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40978; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
40979; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
40980; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
40981; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
40982; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
40983; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
40984; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
40985; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40986; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
40987; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
40988; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40989; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
40990; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
40991; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
40992; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40993; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40994; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
40995; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
40996; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
40997; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
40998; GCN-NEXT:    s_setpc_b64 s[30:31]
40999;
41000; GFX7-LABEL: v_fmuladd_v2bf16:
41001; GFX7:       ; %bb.0:
41002; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41003; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
41004; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
41005; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
41006; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
41007; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41008; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41009; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41010; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41011; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
41012; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
41013; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
41014; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
41015; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41016; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
41017; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41018; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
41019; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
41020; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
41021; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41022; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41023; GFX7-NEXT:    s_setpc_b64 s[30:31]
41024;
41025; GFX8-LABEL: v_fmuladd_v2bf16:
41026; GFX8:       ; %bb.0:
41027; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41028; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
41029; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
41030; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
41031; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
41032; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
41033; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
41034; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41035; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41036; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
41037; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41038; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
41039; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
41040; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
41041; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
41042; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
41043; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41044; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41045; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
41046; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
41047; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41048; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41049; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
41050; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
41051; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
41052; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
41053; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
41054; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41055; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
41056; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41057; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
41058; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
41059; GFX8-NEXT:    v_bfe_u32 v1, v0, 16, 1
41060; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
41061; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
41062; GFX8-NEXT:    v_or_b32_e32 v2, 0x400000, v0
41063; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41064; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
41065; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
41066; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
41067; GFX8-NEXT:    s_setpc_b64 s[30:31]
41068;
41069; GFX9-LABEL: v_fmuladd_v2bf16:
41070; GFX9:       ; %bb.0:
41071; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41072; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
41073; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
41074; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
41075; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
41076; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
41077; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
41078; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41079; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41080; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
41081; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41082; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
41083; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
41084; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41085; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41086; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
41087; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
41088; GFX9-NEXT:    v_add3_u32 v4, v4, v3, s4
41089; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41090; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41091; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
41092; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
41093; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
41094; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
41095; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41096; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
41097; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41098; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
41099; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
41100; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
41101; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
41102; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
41103; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41104; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
41105; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
41106; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
41107; GFX9-NEXT:    s_setpc_b64 s[30:31]
41108;
41109; GFX10-LABEL: v_fmuladd_v2bf16:
41110; GFX10:       ; %bb.0:
41111; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41112; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
41113; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
41114; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41115; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41116; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
41117; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
41118; GFX10-NEXT:    v_bfe_u32 v1, v3, 16, 1
41119; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41120; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
41121; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41122; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v0
41123; GFX10-NEXT:    v_add3_u32 v1, v1, v3, 0x7fff
41124; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
41125; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
41126; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41127; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
41128; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41129; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41130; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
41131; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
41132; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41133; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
41134; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
41135; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
41136; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41137; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
41138; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41139; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
41140; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
41141; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
41142; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41143; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
41144; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
41145; GFX10-NEXT:    s_setpc_b64 s[30:31]
41146;
41147; GFX11-LABEL: v_fmuladd_v2bf16:
41148; GFX11:       ; %bb.0:
41149; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41150; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
41151; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
41152; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41153; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
41154; GFX11-NEXT:    v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
41155; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
41156; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41157; GFX11-NEXT:    v_bfe_u32 v1, v3, 16, 1
41158; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41159; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41160; GFX11-NEXT:    v_add3_u32 v1, v1, v3, 0x7fff
41161; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
41162; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
41163; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v0
41164; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
41165; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
41166; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41167; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
41168; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41169; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41170; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41171; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
41172; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
41173; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41174; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
41175; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41176; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41177; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
41178; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
41179; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
41180; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41181; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41182; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
41183; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
41184; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41185; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
41186; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41187; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
41188; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
41189; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
41190; GFX11-NEXT:    s_setpc_b64 s[30:31]
41191  %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
41192  ret <2 x bfloat> %op
41193}
41194
; fmuladd on a <3 x bfloat> vector: the body is a single call to
; @llvm.fmuladd.v3bf16 on (%a, %b, %c) whose result is returned unchanged.
; The check lines below are autogenerated (see the note on the first line of
; this file), one block per check prefix of the llc invocations at the top of
; the file: GCN (no -mcpu), GFX7 (hawaii), GFX8 (tonga), GFX9 (gfx900),
; GFX10 (gfx1010), GFX11TRUE16 / GFX11FAKE16 (gfx1100 with +real-true16 /
; -real-true16). Regenerate them with the script rather than editing by hand.
; NOTE(review): the two GFX11 blocks appear to differ only in the first source
; operand of the final v_alignbit_b32 (v0 vs. s0) - confirm when regenerating.
41195define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
41196; GCN-LABEL: v_fmuladd_v3bf16:
41197; GCN:       ; %bb.0:
41198; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41199; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
41200; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
41201; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
41202; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
41203; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
41204; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
41205; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
41206; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
41207; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
41208; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
41209; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41210; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
41211; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41212; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41213; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
41214; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41215; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41216; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
41217; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
41218; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
41219; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
41220; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41221; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41222; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41223; GCN-NEXT:    v_add_f32_e32 v2, v2, v8
41224; GCN-NEXT:    v_add_f32_e32 v1, v1, v7
41225; GCN-NEXT:    v_add_f32_e32 v0, v0, v6
41226; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41227; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41228; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41229; GCN-NEXT:    s_setpc_b64 s[30:31]
41230;
41231; GFX7-LABEL: v_fmuladd_v3bf16:
41232; GFX7:       ; %bb.0:
41233; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41234; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
41235; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
41236; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
41237; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
41238; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
41239; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
41240; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
41241; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41242; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41243; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41244; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41245; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41246; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
41247; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
41248; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
41249; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v5
41250; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
41251; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
41252; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41253; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
41254; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41255; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
41256; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41257; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
41258; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
41259; GFX7-NEXT:    v_add_f32_e32 v1, v1, v4
41260; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
41261; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41262; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41263; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41264; GFX7-NEXT:    s_setpc_b64 s[30:31]
41265;
41266; GFX8-LABEL: v_fmuladd_v3bf16:
41267; GFX8:       ; %bb.0:
41268; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41269; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
41270; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
41271; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
41272; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
41273; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
41274; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
41275; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41276; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41277; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
41278; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41279; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
41280; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
41281; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
41282; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
41283; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
41284; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
41285; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
41286; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41287; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
41288; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
41289; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
41290; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
41291; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
41292; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
41293; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
41294; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
41295; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41296; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
41297; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41298; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41299; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
41300; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
41301; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
41302; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41303; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41304; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
41305; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
41306; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v3
41307; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41308; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
41309; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
41310; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
41311; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
41312; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
41313; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41314; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
41315; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41316; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
41317; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
41318; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
41319; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
41320; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
41321; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
41322; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41323; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
41324; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
41325; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
41326; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
41327; GFX8-NEXT:    s_setpc_b64 s[30:31]
41328;
41329; GFX9-LABEL: v_fmuladd_v3bf16:
41330; GFX9:       ; %bb.0:
41331; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41332; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
41333; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
41334; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
41335; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
41336; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
41337; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
41338; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41339; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41340; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
41341; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41342; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
41343; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
41344; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
41345; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
41346; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
41347; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41348; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
41349; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
41350; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
41351; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
41352; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
41353; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
41354; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
41355; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41356; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
41357; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41358; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41359; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
41360; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41361; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41362; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
41363; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
41364; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
41365; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
41366; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41367; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
41368; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
41369; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
41370; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
41371; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41372; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
41373; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41374; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
41375; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
41376; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
41377; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
41378; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
41379; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41380; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
41381; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
41382; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
41383; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
41384; GFX9-NEXT:    s_setpc_b64 s[30:31]
41385;
41386; GFX10-LABEL: v_fmuladd_v3bf16:
41387; GFX10:       ; %bb.0:
41388; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41389; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
41390; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
41391; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
41392; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
41393; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41394; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41395; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
41396; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
41397; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
41398; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
41399; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41400; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
41401; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41402; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
41403; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41404; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
41405; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
41406; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v0
41407; GFX10-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
41408; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41409; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41410; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
41411; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41412; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41413; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41414; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41415; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41416; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
41417; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41418; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41419; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41420; GFX10-NEXT:    v_add_f32_e32 v2, v2, v5
41421; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41422; GFX10-NEXT:    v_bfe_u32 v3, v2, 16, 1
41423; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
41424; GFX10-NEXT:    v_or_b32_e32 v7, 0x400000, v2
41425; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
41426; GFX10-NEXT:    v_bfe_u32 v4, v1, 16, 1
41427; GFX10-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
41428; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
41429; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
41430; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
41431; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41432; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
41433; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41434; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41435; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41436; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
41437; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41438; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
41439; GFX10-NEXT:    s_setpc_b64 s[30:31]
41440;
41441; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
41442; GFX11TRUE16:       ; %bb.0:
41443; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41444; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
41445; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
41446; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41447; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41448; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41449; GFX11TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
41450; GFX11TRUE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
41451; GFX11TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
41452; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41453; GFX11TRUE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
41454; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
41455; GFX11TRUE16-NEXT:    v_mul_f32_e32 v1, v1, v3
41456; GFX11TRUE16-NEXT:    v_mul_f32_e32 v3, v7, v6
41457; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41458; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
41459; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41460; GFX11TRUE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
41461; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41462; GFX11TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
41463; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41464; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
41465; GFX11TRUE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
41466; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41467; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41468; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
41469; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41470; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41471; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41472; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41473; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41474; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41475; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41476; GFX11TRUE16-NEXT:    v_add_f32_e32 v2, v2, v5
41477; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41478; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41479; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41480; GFX11TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
41481; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41482; GFX11TRUE16-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
41483; GFX11TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
41484; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
41485; GFX11TRUE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
41486; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
41487; GFX11TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
41488; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
41489; GFX11TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
41490; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41491; GFX11TRUE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
41492; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
41493; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41494; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41495; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41496; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41497; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41498; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
41499; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41500; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
41501; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
41502; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
41503;
41504; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
41505; GFX11FAKE16:       ; %bb.0:
41506; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41507; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
41508; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
41509; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41510; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41511; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41512; GFX11FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
41513; GFX11FAKE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
41514; GFX11FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
41515; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41516; GFX11FAKE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
41517; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
41518; GFX11FAKE16-NEXT:    v_mul_f32_e32 v1, v1, v3
41519; GFX11FAKE16-NEXT:    v_mul_f32_e32 v3, v7, v6
41520; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41521; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
41522; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41523; GFX11FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
41524; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41525; GFX11FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
41526; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41527; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
41528; GFX11FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
41529; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41530; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41531; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
41532; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41533; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41534; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41535; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41536; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41537; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41538; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41539; GFX11FAKE16-NEXT:    v_add_f32_e32 v2, v2, v5
41540; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41541; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41542; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41543; GFX11FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
41544; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41545; GFX11FAKE16-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
41546; GFX11FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
41547; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
41548; GFX11FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
41549; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
41550; GFX11FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
41551; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
41552; GFX11FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
41553; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41554; GFX11FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
41555; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
41556; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41557; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41558; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41559; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41560; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41561; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
41562; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41563; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
41564; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
41565; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
41566  %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
41567  ret <3 x bfloat> %op
41568}
41569
41570define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
41571; GCN-LABEL: v_fmuladd_v4bf16:
41572; GCN:       ; %bb.0:
41573; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41574; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
41575; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
41576; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
41577; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
41578; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
41579; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
41580; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
41581; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
41582; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
41583; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
41584; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
41585; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
41586; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
41587; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41588; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
41589; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
41590; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41591; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
41592; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
41593; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41594; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
41595; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41596; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41597; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
41598; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
41599; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
41600; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
41601; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
41602; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41603; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41604; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41605; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41606; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
41607; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
41608; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
41609; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
41610; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41611; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41612; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41613; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41614; GCN-NEXT:    s_setpc_b64 s[30:31]
41615;
41616; GFX7-LABEL: v_fmuladd_v4bf16:
41617; GFX7:       ; %bb.0:
41618; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41619; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
41620; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
41621; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
41622; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
41623; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
41624; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
41625; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
41626; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
41627; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
41628; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41629; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
41630; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41631; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
41632; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41633; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41634; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41635; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
41636; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
41637; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
41638; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
41639; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v7
41640; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v6
41641; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
41642; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
41643; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41644; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v11
41645; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41646; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v10
41647; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41648; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
41649; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41650; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
41651; GFX7-NEXT:    v_add_f32_e32 v3, v3, v7
41652; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
41653; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
41654; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
41655; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41656; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41657; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41658; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41659; GFX7-NEXT:    s_setpc_b64 s[30:31]
41660;
41661; GFX8-LABEL: v_fmuladd_v4bf16:
41662; GFX8:       ; %bb.0:
41663; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41664; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
41665; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
41666; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
41667; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
41668; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
41669; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
41670; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v6
41671; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
41672; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
41673; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
41674; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
41675; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
41676; GFX8-NEXT:    v_bfe_u32 v7, v6, 16, 1
41677; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
41678; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v6
41679; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41680; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41681; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
41682; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
41683; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v6
41684; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
41685; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
41686; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
41687; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
41688; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
41689; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
41690; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41691; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v7, vcc
41692; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41693; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
41694; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
41695; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
41696; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
41697; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
41698; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v1
41699; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41700; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
41701; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
41702; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
41703; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
41704; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
41705; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
41706; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
41707; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
41708; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41709; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
41710; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41711; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41712; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
41713; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
41714; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
41715; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41716; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41717; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
41718; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
41719; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
41720; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41721; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
41722; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
41723; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
41724; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
41725; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
41726; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41727; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
41728; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41729; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
41730; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
41731; GFX8-NEXT:    v_bfe_u32 v2, v0, 16, 1
41732; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
41733; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
41734; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
41735; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41736; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
41737; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
41738; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
41739; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
41740; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
41741; GFX8-NEXT:    s_setpc_b64 s[30:31]
41742;
41743; GFX9-LABEL: v_fmuladd_v4bf16:
41744; GFX9:       ; %bb.0:
41745; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41746; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
41747; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
41748; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
41749; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
41750; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
41751; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
41752; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
41753; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
41754; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
41755; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
41756; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
41757; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
41758; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41759; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41760; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
41761; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
41762; GFX9-NEXT:    v_add3_u32 v7, v7, v6, s4
41763; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
41764; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
41765; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
41766; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
41767; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
41768; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v1
41769; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41770; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v7, vcc
41771; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41772; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
41773; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
41774; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
41775; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
41776; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v1
41777; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
41778; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
41779; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
41780; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
41781; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
41782; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
41783; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
41784; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
41785; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41786; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
41787; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41788; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
41789; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
41790; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41791; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41792; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
41793; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
41794; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s4
41795; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
41796; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
41797; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
41798; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
41799; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
41800; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v0
41801; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41802; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
41803; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41804; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
41805; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
41806; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
41807; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
41808; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
41809; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
41810; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
41811; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
41812; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
41813; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
41814; GFX9-NEXT:    s_setpc_b64 s[30:31]
41815;
41816; GFX10-LABEL: v_fmuladd_v4bf16:
41817; GFX10:       ; %bb.0:
41818; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41819; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
41820; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
41821; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41822; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41823; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
41824; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41825; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
41826; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
41827; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41828; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
41829; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
41830; GFX10-NEXT:    v_bfe_u32 v10, v6, 16, 1
41831; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v6
41832; GFX10-NEXT:    v_mul_f32_e32 v7, v9, v7
41833; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
41834; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
41835; GFX10-NEXT:    v_add3_u32 v10, v10, v6, 0x7fff
41836; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
41837; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41838; GFX10-NEXT:    v_bfe_u32 v9, v7, 16, 1
41839; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41840; GFX10-NEXT:    v_bfe_u32 v11, v0, 16, 1
41841; GFX10-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc_lo
41842; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41843; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v7
41844; GFX10-NEXT:    v_add3_u32 v9, v9, v7, 0x7fff
41845; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v0
41846; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41847; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41848; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
41849; GFX10-NEXT:    v_add3_u32 v11, v11, v0, 0x7fff
41850; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
41851; GFX10-NEXT:    v_add_f32_e32 v3, v3, v8
41852; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41853; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc_lo
41854; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41855; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
41856; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41857; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
41858; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
41859; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc_lo
41860; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
41861; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41862; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41863; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
41864; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41865; GFX10-NEXT:    v_bfe_u32 v6, v1, 16, 1
41866; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
41867; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
41868; GFX10-NEXT:    v_add3_u32 v4, v7, v3, 0x7fff
41869; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
41870; GFX10-NEXT:    v_bfe_u32 v8, v0, 16, 1
41871; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
41872; GFX10-NEXT:    v_add3_u32 v4, v6, v1, 0x7fff
41873; GFX10-NEXT:    v_add3_u32 v5, v7, v2, 0x7fff
41874; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
41875; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
41876; GFX10-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
41877; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
41878; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
41879; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41880; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
41881; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41882; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
41883; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
41884; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
41885; GFX10-NEXT:    s_setpc_b64 s[30:31]
41886;
41887; GFX11-LABEL: v_fmuladd_v4bf16:
41888; GFX11:       ; %bb.0:
41889; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41890; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
41891; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41892; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
41893; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
41894; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
41895; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
41896; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41897; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41898; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
41899; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
41900; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2
41901; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
41902; GFX11-NEXT:    v_bfe_u32 v10, v6, 16, 1
41903; GFX11-NEXT:    v_mul_f32_e32 v7, v9, v7
41904; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v6
41905; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
41906; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41907; GFX11-NEXT:    v_add3_u32 v10, v10, v6, 0x7fff
41908; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v1
41909; GFX11-NEXT:    v_bfe_u32 v9, v7, 16, 1
41910; GFX11-NEXT:    v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
41911; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
41912; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41913; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v7
41914; GFX11-NEXT:    v_add3_u32 v9, v9, v7, 0x7fff
41915; GFX11-NEXT:    v_bfe_u32 v11, v0, 16, 1
41916; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
41917; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v0
41918; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
41919; GFX11-NEXT:    v_add3_u32 v11, v11, v0, 0x7fff
41920; GFX11-NEXT:    v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
41921; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
41922; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
41923; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41924; GFX11-NEXT:    v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
41925; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41926; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
41927; GFX11-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc_lo
41928; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
41929; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
41930; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
41931; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41932; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
41933; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
41934; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
41935; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
41936; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
41937; GFX11-NEXT:    v_add_f32_e32 v3, v3, v8
41938; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
41939; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41940; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
41941; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v3
41942; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
41943; GFX11-NEXT:    v_add3_u32 v4, v7, v3, 0x7fff
41944; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
41945; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41946; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
41947; GFX11-NEXT:    v_add3_u32 v4, v6, v1, 0x7fff
41948; GFX11-NEXT:    v_add3_u32 v5, v7, v2, 0x7fff
41949; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v2
41950; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
41951; GFX11-NEXT:    v_add3_u32 v7, v8, v0, 0x7fff
41952; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
41953; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41954; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
41955; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
41956; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
41957; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
41958; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41959; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
41960; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
41961; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
41962; GFX11-NEXT:    s_setpc_b64 s[30:31]
41963  %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
41964  ret <4 x bfloat> %op
41965}
41966