xref: /llvm-project/llvm/test/CodeGen/AMDGPU/select.f16.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
5
; Scalar half select driven by an ordered less-than compare:
;   r = (a < b) ? c : d
; All four operands come from volatile loads through the kernel's global
; (addrspace 1) pointer arguments; the result is stored through %r.
; In the expected output below, the SI checks widen every operand with
; v_cvt_f32_f16, compare and select in f32, then narrow with v_cvt_f16_f32,
; while the VI and GFX11 checks compare with v_cmp_lt_f16 directly.
6define amdgpu_kernel void @select_f16(
7; SI-LABEL: select_f16:
8; SI:       ; %bb.0: ; %entry
9; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
10; SI-NEXT:    s_mov_b32 s3, 0xf000
11; SI-NEXT:    s_mov_b32 s2, -1
12; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
13; SI-NEXT:    s_mov_b32 s18, s2
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_mov_b32 s16, s10
16; SI-NEXT:    s_mov_b32 s17, s11
17; SI-NEXT:    s_mov_b32 s19, s3
18; SI-NEXT:    s_mov_b32 s20, s12
19; SI-NEXT:    s_mov_b32 s21, s13
20; SI-NEXT:    s_mov_b32 s22, s2
21; SI-NEXT:    s_mov_b32 s23, s3
22; SI-NEXT:    s_mov_b32 s12, s14
23; SI-NEXT:    s_mov_b32 s13, s15
24; SI-NEXT:    s_mov_b32 s14, s2
25; SI-NEXT:    s_mov_b32 s15, s3
26; SI-NEXT:    s_mov_b32 s6, s2
27; SI-NEXT:    s_mov_b32 s7, s3
28; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
29; SI-NEXT:    s_waitcnt vmcnt(0)
30; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
31; SI-NEXT:    s_waitcnt vmcnt(0)
32; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
33; SI-NEXT:    s_waitcnt vmcnt(0)
34; SI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 glc
35; SI-NEXT:    s_waitcnt vmcnt(0)
36; SI-NEXT:    s_mov_b32 s0, s8
37; SI-NEXT:    s_mov_b32 s1, s9
38; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
39; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
40; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
41; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
42; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
43; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
44; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
45; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
46; SI-NEXT:    s_endpgm
47;
48; VI-LABEL: select_f16:
49; VI:       ; %bb.0: ; %entry
50; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
51; VI-NEXT:    s_mov_b32 s3, 0xf000
52; VI-NEXT:    s_mov_b32 s2, -1
53; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x44
54; VI-NEXT:    s_mov_b32 s18, s2
55; VI-NEXT:    s_waitcnt lgkmcnt(0)
56; VI-NEXT:    s_mov_b32 s16, s10
57; VI-NEXT:    s_mov_b32 s17, s11
58; VI-NEXT:    s_mov_b32 s19, s3
59; VI-NEXT:    s_mov_b32 s20, s12
60; VI-NEXT:    s_mov_b32 s21, s13
61; VI-NEXT:    s_mov_b32 s22, s2
62; VI-NEXT:    s_mov_b32 s23, s3
63; VI-NEXT:    s_mov_b32 s12, s14
64; VI-NEXT:    s_mov_b32 s13, s15
65; VI-NEXT:    s_mov_b32 s14, s2
66; VI-NEXT:    s_mov_b32 s15, s3
67; VI-NEXT:    s_mov_b32 s6, s2
68; VI-NEXT:    s_mov_b32 s7, s3
69; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
70; VI-NEXT:    s_waitcnt vmcnt(0)
71; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
72; VI-NEXT:    s_waitcnt vmcnt(0)
73; VI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
74; VI-NEXT:    s_waitcnt vmcnt(0)
75; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 glc
76; VI-NEXT:    s_waitcnt vmcnt(0)
77; VI-NEXT:    s_mov_b32 s0, s8
78; VI-NEXT:    s_mov_b32 s1, s9
79; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
80; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
81; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
82; VI-NEXT:    s_endpgm
83;
84; GFX11-LABEL: select_f16:
85; GFX11:       ; %bb.0: ; %entry
86; GFX11-NEXT:    s_clause 0x1
87; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
88; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
89; GFX11-NEXT:    s_mov_b32 s6, -1
90; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
91; GFX11-NEXT:    s_mov_b32 s18, s6
92; GFX11-NEXT:    s_mov_b32 s19, s7
93; GFX11-NEXT:    s_mov_b32 s22, s6
94; GFX11-NEXT:    s_mov_b32 s23, s7
95; GFX11-NEXT:    s_mov_b32 s26, s6
96; GFX11-NEXT:    s_mov_b32 s27, s7
97; GFX11-NEXT:    s_mov_b32 s2, s6
98; GFX11-NEXT:    s_mov_b32 s3, s7
99; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX11-NEXT:    s_mov_b32 s16, s10
101; GFX11-NEXT:    s_mov_b32 s17, s11
102; GFX11-NEXT:    s_mov_b32 s20, s12
103; GFX11-NEXT:    s_mov_b32 s21, s13
104; GFX11-NEXT:    s_mov_b32 s24, s14
105; GFX11-NEXT:    s_mov_b32 s25, s15
106; GFX11-NEXT:    buffer_load_u16 v0, off, s[16:19], 0 glc dlc
107; GFX11-NEXT:    s_waitcnt vmcnt(0)
108; GFX11-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
109; GFX11-NEXT:    s_waitcnt vmcnt(0)
110; GFX11-NEXT:    buffer_load_u16 v2, off, s[24:27], 0 glc dlc
111; GFX11-NEXT:    s_waitcnt vmcnt(0)
112; GFX11-NEXT:    buffer_load_u16 v3, off, s[0:3], 0 glc dlc
113; GFX11-NEXT:    s_waitcnt vmcnt(0)
114; GFX11-NEXT:    s_mov_b32 s4, s8
115; GFX11-NEXT:    s_mov_b32 s5, s9
116; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
117; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
118; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
119; GFX11-NEXT:    s_endpgm
120
121    ptr addrspace(1) %r,
122    ptr addrspace(1) %a,
123    ptr addrspace(1) %b,
124    ptr addrspace(1) %c,
125    ptr addrspace(1) %d) {
126entry:
  ; Volatile loads: each one appears as its own buffer load followed by a
  ; wait in the expected output above.
127  %a.val = load volatile half, ptr addrspace(1) %a
128  %b.val = load volatile half, ptr addrspace(1) %b
129  %c.val = load volatile half, ptr addrspace(1) %c
130  %d.val = load volatile half, ptr addrspace(1) %d
  ; Ordered less-than on the two compare operands, then pick %c or %d.
131  %fcmp = fcmp olt half %a.val, %b.val
132  %r.val = select i1 %fcmp, half %c.val, half %d.val
133  store half %r.val, ptr addrspace(1) %r
134  ret void
135}
136
; Scalar half select where the left-hand compare operand is a constant:
;   r = (0xH3800 < b) ? c : d
; The constant 0xH3800 shows up as the operand 0.5 in every expected compare
; below (v_cmp_lt_f32 for the SI checks, v_cmp_lt_f16 for VI and GFX11), so
; only three values are loaded.
137define amdgpu_kernel void @select_f16_imm_a(
138; SI-LABEL: select_f16_imm_a:
139; SI:       ; %bb.0: ; %entry
140; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
141; SI-NEXT:    s_mov_b32 s11, 0xf000
142; SI-NEXT:    s_mov_b32 s10, -1
143; SI-NEXT:    s_mov_b32 s14, s10
144; SI-NEXT:    s_mov_b32 s15, s11
145; SI-NEXT:    s_waitcnt lgkmcnt(0)
146; SI-NEXT:    s_mov_b32 s12, s2
147; SI-NEXT:    s_mov_b32 s13, s3
148; SI-NEXT:    s_mov_b32 s16, s4
149; SI-NEXT:    s_mov_b32 s17, s5
150; SI-NEXT:    s_mov_b32 s18, s10
151; SI-NEXT:    s_mov_b32 s19, s11
152; SI-NEXT:    s_mov_b32 s4, s6
153; SI-NEXT:    s_mov_b32 s5, s7
154; SI-NEXT:    s_mov_b32 s6, s10
155; SI-NEXT:    s_mov_b32 s7, s11
156; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
157; SI-NEXT:    s_waitcnt vmcnt(0)
158; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
159; SI-NEXT:    s_waitcnt vmcnt(0)
160; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
161; SI-NEXT:    s_waitcnt vmcnt(0)
162; SI-NEXT:    s_mov_b32 s8, s0
163; SI-NEXT:    s_mov_b32 s9, s1
164; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
165; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
166; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
167; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
168; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
169; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
170; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
171; SI-NEXT:    s_endpgm
172;
173; VI-LABEL: select_f16_imm_a:
174; VI:       ; %bb.0: ; %entry
175; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
176; VI-NEXT:    s_mov_b32 s11, 0xf000
177; VI-NEXT:    s_mov_b32 s10, -1
178; VI-NEXT:    s_mov_b32 s14, s10
179; VI-NEXT:    s_mov_b32 s15, s11
180; VI-NEXT:    s_waitcnt lgkmcnt(0)
181; VI-NEXT:    s_mov_b32 s12, s2
182; VI-NEXT:    s_mov_b32 s13, s3
183; VI-NEXT:    s_mov_b32 s16, s4
184; VI-NEXT:    s_mov_b32 s17, s5
185; VI-NEXT:    s_mov_b32 s18, s10
186; VI-NEXT:    s_mov_b32 s19, s11
187; VI-NEXT:    s_mov_b32 s4, s6
188; VI-NEXT:    s_mov_b32 s5, s7
189; VI-NEXT:    s_mov_b32 s6, s10
190; VI-NEXT:    s_mov_b32 s7, s11
191; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
192; VI-NEXT:    s_waitcnt vmcnt(0)
193; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
194; VI-NEXT:    s_waitcnt vmcnt(0)
195; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
196; VI-NEXT:    s_waitcnt vmcnt(0)
197; VI-NEXT:    s_mov_b32 s8, s0
198; VI-NEXT:    s_mov_b32 s9, s1
199; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
200; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
201; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
202; VI-NEXT:    s_endpgm
203;
204; GFX11-LABEL: select_f16_imm_a:
205; GFX11:       ; %bb.0: ; %entry
206; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
207; GFX11-NEXT:    s_mov_b32 s10, -1
208; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
209; GFX11-NEXT:    s_mov_b32 s14, s10
210; GFX11-NEXT:    s_mov_b32 s15, s11
211; GFX11-NEXT:    s_mov_b32 s18, s10
212; GFX11-NEXT:    s_mov_b32 s19, s11
213; GFX11-NEXT:    s_mov_b32 s22, s10
214; GFX11-NEXT:    s_mov_b32 s23, s11
215; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX11-NEXT:    s_mov_b32 s12, s2
217; GFX11-NEXT:    s_mov_b32 s13, s3
218; GFX11-NEXT:    s_mov_b32 s16, s4
219; GFX11-NEXT:    s_mov_b32 s17, s5
220; GFX11-NEXT:    s_mov_b32 s20, s6
221; GFX11-NEXT:    s_mov_b32 s21, s7
222; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
223; GFX11-NEXT:    s_waitcnt vmcnt(0)
224; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
225; GFX11-NEXT:    s_waitcnt vmcnt(0)
226; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
227; GFX11-NEXT:    s_waitcnt vmcnt(0)
228; GFX11-NEXT:    s_mov_b32 s8, s0
229; GFX11-NEXT:    s_mov_b32 s9, s1
230; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
231; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
232; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
233; GFX11-NEXT:    s_endpgm
234    ptr addrspace(1) %r,
235    ptr addrspace(1) %b,
236    ptr addrspace(1) %c,
237    ptr addrspace(1) %d) {
238entry:
239  %b.val = load volatile half, ptr addrspace(1) %b
240  %c.val = load volatile half, ptr addrspace(1) %c
241  %d.val = load volatile half, ptr addrspace(1) %d
  ; Constant on the left: the expected compares keep it as the first source.
242  %fcmp = fcmp olt half 0xH3800, %b.val
243  %r.val = select i1 %fcmp, half %c.val, half %d.val
244  store half %r.val, ptr addrspace(1) %r
245  ret void
246}
247
; Scalar half select where the right-hand compare operand is a constant:
;   r = (a < 0xH3800) ? c : d
; In the expected output below the compare is emitted in commuted form: a
; greater-than compare (v_cmp_gt_*) with 0.5 as the first source and the
; loaded value as the second.
248define amdgpu_kernel void @select_f16_imm_b(
249; SI-LABEL: select_f16_imm_b:
250; SI:       ; %bb.0: ; %entry
251; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
252; SI-NEXT:    s_mov_b32 s11, 0xf000
253; SI-NEXT:    s_mov_b32 s10, -1
254; SI-NEXT:    s_mov_b32 s14, s10
255; SI-NEXT:    s_mov_b32 s15, s11
256; SI-NEXT:    s_waitcnt lgkmcnt(0)
257; SI-NEXT:    s_mov_b32 s12, s2
258; SI-NEXT:    s_mov_b32 s13, s3
259; SI-NEXT:    s_mov_b32 s16, s4
260; SI-NEXT:    s_mov_b32 s17, s5
261; SI-NEXT:    s_mov_b32 s18, s10
262; SI-NEXT:    s_mov_b32 s19, s11
263; SI-NEXT:    s_mov_b32 s4, s6
264; SI-NEXT:    s_mov_b32 s5, s7
265; SI-NEXT:    s_mov_b32 s6, s10
266; SI-NEXT:    s_mov_b32 s7, s11
267; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
268; SI-NEXT:    s_waitcnt vmcnt(0)
269; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
270; SI-NEXT:    s_waitcnt vmcnt(0)
271; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
272; SI-NEXT:    s_waitcnt vmcnt(0)
273; SI-NEXT:    s_mov_b32 s8, s0
274; SI-NEXT:    s_mov_b32 s9, s1
275; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
276; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
277; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
278; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
279; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
280; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
281; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
282; SI-NEXT:    s_endpgm
283;
284; VI-LABEL: select_f16_imm_b:
285; VI:       ; %bb.0: ; %entry
286; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
287; VI-NEXT:    s_mov_b32 s11, 0xf000
288; VI-NEXT:    s_mov_b32 s10, -1
289; VI-NEXT:    s_mov_b32 s14, s10
290; VI-NEXT:    s_mov_b32 s15, s11
291; VI-NEXT:    s_waitcnt lgkmcnt(0)
292; VI-NEXT:    s_mov_b32 s12, s2
293; VI-NEXT:    s_mov_b32 s13, s3
294; VI-NEXT:    s_mov_b32 s16, s4
295; VI-NEXT:    s_mov_b32 s17, s5
296; VI-NEXT:    s_mov_b32 s18, s10
297; VI-NEXT:    s_mov_b32 s19, s11
298; VI-NEXT:    s_mov_b32 s4, s6
299; VI-NEXT:    s_mov_b32 s5, s7
300; VI-NEXT:    s_mov_b32 s6, s10
301; VI-NEXT:    s_mov_b32 s7, s11
302; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
303; VI-NEXT:    s_waitcnt vmcnt(0)
304; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
305; VI-NEXT:    s_waitcnt vmcnt(0)
306; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
307; VI-NEXT:    s_waitcnt vmcnt(0)
308; VI-NEXT:    s_mov_b32 s8, s0
309; VI-NEXT:    s_mov_b32 s9, s1
310; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
311; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
312; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
313; VI-NEXT:    s_endpgm
314;
315; GFX11-LABEL: select_f16_imm_b:
316; GFX11:       ; %bb.0: ; %entry
317; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
318; GFX11-NEXT:    s_mov_b32 s10, -1
319; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
320; GFX11-NEXT:    s_mov_b32 s14, s10
321; GFX11-NEXT:    s_mov_b32 s15, s11
322; GFX11-NEXT:    s_mov_b32 s18, s10
323; GFX11-NEXT:    s_mov_b32 s19, s11
324; GFX11-NEXT:    s_mov_b32 s22, s10
325; GFX11-NEXT:    s_mov_b32 s23, s11
326; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX11-NEXT:    s_mov_b32 s12, s2
328; GFX11-NEXT:    s_mov_b32 s13, s3
329; GFX11-NEXT:    s_mov_b32 s16, s4
330; GFX11-NEXT:    s_mov_b32 s17, s5
331; GFX11-NEXT:    s_mov_b32 s20, s6
332; GFX11-NEXT:    s_mov_b32 s21, s7
333; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
334; GFX11-NEXT:    s_waitcnt vmcnt(0)
335; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
336; GFX11-NEXT:    s_waitcnt vmcnt(0)
337; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
338; GFX11-NEXT:    s_waitcnt vmcnt(0)
339; GFX11-NEXT:    s_mov_b32 s8, s0
340; GFX11-NEXT:    s_mov_b32 s9, s1
341; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
342; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
343; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
344; GFX11-NEXT:    s_endpgm
345    ptr addrspace(1) %r,
346    ptr addrspace(1) %a,
347    ptr addrspace(1) %c,
348    ptr addrspace(1) %d) {
349entry:
350  %a.val = load volatile half, ptr addrspace(1) %a
351  %c.val = load volatile half, ptr addrspace(1) %c
352  %d.val = load volatile half, ptr addrspace(1) %d
  ; Constant on the right: the expected compares swap the operands and use
  ; the greater-than form instead.
353  %fcmp = fcmp olt half %a.val, 0xH3800
354  %r.val = select i1 %fcmp, half %c.val, half %d.val
355  store half %r.val, ptr addrspace(1) %r
356  ret void
357}
358
; Scalar half select where the value chosen on a true compare is a constant:
;   r = (a < b) ? 0xH3800 : d
; In the expected output below the compare becomes v_cmp_nlt (not less-than)
; and the constant is the first v_cndmask source, with the loaded %d value as
; the second. The constant is written 0.5 in the SI checks (select done in
; f32), moved into v3 as 0x3800 in the VI checks, and used as the literal
; 0x3800 in the GFX11 checks.
359define amdgpu_kernel void @select_f16_imm_c(
360; SI-LABEL: select_f16_imm_c:
361; SI:       ; %bb.0: ; %entry
362; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
363; SI-NEXT:    s_mov_b32 s11, 0xf000
364; SI-NEXT:    s_mov_b32 s10, -1
365; SI-NEXT:    s_mov_b32 s14, s10
366; SI-NEXT:    s_mov_b32 s15, s11
367; SI-NEXT:    s_waitcnt lgkmcnt(0)
368; SI-NEXT:    s_mov_b32 s12, s2
369; SI-NEXT:    s_mov_b32 s13, s3
370; SI-NEXT:    s_mov_b32 s16, s4
371; SI-NEXT:    s_mov_b32 s17, s5
372; SI-NEXT:    s_mov_b32 s18, s10
373; SI-NEXT:    s_mov_b32 s19, s11
374; SI-NEXT:    s_mov_b32 s4, s6
375; SI-NEXT:    s_mov_b32 s5, s7
376; SI-NEXT:    s_mov_b32 s6, s10
377; SI-NEXT:    s_mov_b32 s7, s11
378; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
379; SI-NEXT:    s_waitcnt vmcnt(0)
380; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
381; SI-NEXT:    s_waitcnt vmcnt(0)
382; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
383; SI-NEXT:    s_waitcnt vmcnt(0)
384; SI-NEXT:    s_mov_b32 s8, s0
385; SI-NEXT:    s_mov_b32 s9, s1
386; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
387; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
388; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
389; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
390; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
391; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
392; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
393; SI-NEXT:    s_endpgm
394;
395; VI-LABEL: select_f16_imm_c:
396; VI:       ; %bb.0: ; %entry
397; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
398; VI-NEXT:    s_mov_b32 s11, 0xf000
399; VI-NEXT:    s_mov_b32 s10, -1
400; VI-NEXT:    s_mov_b32 s14, s10
401; VI-NEXT:    s_mov_b32 s15, s11
402; VI-NEXT:    s_waitcnt lgkmcnt(0)
403; VI-NEXT:    s_mov_b32 s12, s2
404; VI-NEXT:    s_mov_b32 s13, s3
405; VI-NEXT:    s_mov_b32 s16, s4
406; VI-NEXT:    s_mov_b32 s17, s5
407; VI-NEXT:    s_mov_b32 s18, s10
408; VI-NEXT:    s_mov_b32 s19, s11
409; VI-NEXT:    s_mov_b32 s4, s6
410; VI-NEXT:    s_mov_b32 s5, s7
411; VI-NEXT:    s_mov_b32 s6, s10
412; VI-NEXT:    s_mov_b32 s7, s11
413; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
414; VI-NEXT:    s_waitcnt vmcnt(0)
415; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
416; VI-NEXT:    s_waitcnt vmcnt(0)
417; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
418; VI-NEXT:    s_waitcnt vmcnt(0)
419; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
420; VI-NEXT:    s_mov_b32 s8, s0
421; VI-NEXT:    s_mov_b32 s9, s1
422; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
423; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
424; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
425; VI-NEXT:    s_endpgm
426;
427; GFX11-LABEL: select_f16_imm_c:
428; GFX11:       ; %bb.0: ; %entry
429; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
430; GFX11-NEXT:    s_mov_b32 s10, -1
431; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
432; GFX11-NEXT:    s_mov_b32 s14, s10
433; GFX11-NEXT:    s_mov_b32 s15, s11
434; GFX11-NEXT:    s_mov_b32 s18, s10
435; GFX11-NEXT:    s_mov_b32 s19, s11
436; GFX11-NEXT:    s_mov_b32 s22, s10
437; GFX11-NEXT:    s_mov_b32 s23, s11
438; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX11-NEXT:    s_mov_b32 s12, s2
440; GFX11-NEXT:    s_mov_b32 s13, s3
441; GFX11-NEXT:    s_mov_b32 s16, s4
442; GFX11-NEXT:    s_mov_b32 s17, s5
443; GFX11-NEXT:    s_mov_b32 s20, s6
444; GFX11-NEXT:    s_mov_b32 s21, s7
445; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
446; GFX11-NEXT:    s_waitcnt vmcnt(0)
447; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
448; GFX11-NEXT:    s_waitcnt vmcnt(0)
449; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
450; GFX11-NEXT:    s_waitcnt vmcnt(0)
451; GFX11-NEXT:    s_mov_b32 s8, s0
452; GFX11-NEXT:    s_mov_b32 s9, s1
453; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
454; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
455; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
456; GFX11-NEXT:    s_endpgm
457    ptr addrspace(1) %r,
458    ptr addrspace(1) %a,
459    ptr addrspace(1) %b,
460    ptr addrspace(1) %d) {
461entry:
462  %a.val = load volatile half, ptr addrspace(1) %a
463  %b.val = load volatile half, ptr addrspace(1) %b
464  %d.val = load volatile half, ptr addrspace(1) %d
465  %fcmp = fcmp olt half %a.val, %b.val
  ; Constant in the true position of the select.
466  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
467  store half %r.val, ptr addrspace(1) %r
468  ret void
469}
470
; Scalar half select where the value chosen on a false compare is a constant:
;   r = (a < b) ? c : 0xH3800
; In the expected output below the compare stays v_cmp_lt and the constant is
; the first v_cndmask source, with the loaded %c value as the second. The
; constant is written 0.5 in the SI checks (select done in f32), moved into
; v3 as 0x3800 in the VI checks, and used as the literal 0x3800 in the GFX11
; checks.
471define amdgpu_kernel void @select_f16_imm_d(
472; SI-LABEL: select_f16_imm_d:
473; SI:       ; %bb.0: ; %entry
474; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
475; SI-NEXT:    s_mov_b32 s11, 0xf000
476; SI-NEXT:    s_mov_b32 s10, -1
477; SI-NEXT:    s_mov_b32 s14, s10
478; SI-NEXT:    s_mov_b32 s15, s11
479; SI-NEXT:    s_waitcnt lgkmcnt(0)
480; SI-NEXT:    s_mov_b32 s12, s2
481; SI-NEXT:    s_mov_b32 s13, s3
482; SI-NEXT:    s_mov_b32 s16, s4
483; SI-NEXT:    s_mov_b32 s17, s5
484; SI-NEXT:    s_mov_b32 s18, s10
485; SI-NEXT:    s_mov_b32 s19, s11
486; SI-NEXT:    s_mov_b32 s4, s6
487; SI-NEXT:    s_mov_b32 s5, s7
488; SI-NEXT:    s_mov_b32 s6, s10
489; SI-NEXT:    s_mov_b32 s7, s11
490; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
491; SI-NEXT:    s_waitcnt vmcnt(0)
492; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
493; SI-NEXT:    s_waitcnt vmcnt(0)
494; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
495; SI-NEXT:    s_waitcnt vmcnt(0)
496; SI-NEXT:    s_mov_b32 s8, s0
497; SI-NEXT:    s_mov_b32 s9, s1
498; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
499; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
500; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
501; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
502; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
503; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
504; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
505; SI-NEXT:    s_endpgm
506;
507; VI-LABEL: select_f16_imm_d:
508; VI:       ; %bb.0: ; %entry
509; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
510; VI-NEXT:    s_mov_b32 s11, 0xf000
511; VI-NEXT:    s_mov_b32 s10, -1
512; VI-NEXT:    s_mov_b32 s14, s10
513; VI-NEXT:    s_mov_b32 s15, s11
514; VI-NEXT:    s_waitcnt lgkmcnt(0)
515; VI-NEXT:    s_mov_b32 s12, s2
516; VI-NEXT:    s_mov_b32 s13, s3
517; VI-NEXT:    s_mov_b32 s16, s4
518; VI-NEXT:    s_mov_b32 s17, s5
519; VI-NEXT:    s_mov_b32 s18, s10
520; VI-NEXT:    s_mov_b32 s19, s11
521; VI-NEXT:    s_mov_b32 s4, s6
522; VI-NEXT:    s_mov_b32 s5, s7
523; VI-NEXT:    s_mov_b32 s6, s10
524; VI-NEXT:    s_mov_b32 s7, s11
525; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
526; VI-NEXT:    s_waitcnt vmcnt(0)
527; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
528; VI-NEXT:    s_waitcnt vmcnt(0)
529; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
530; VI-NEXT:    s_waitcnt vmcnt(0)
531; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
532; VI-NEXT:    s_mov_b32 s8, s0
533; VI-NEXT:    s_mov_b32 s9, s1
534; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
535; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
536; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
537; VI-NEXT:    s_endpgm
538;
539; GFX11-LABEL: select_f16_imm_d:
540; GFX11:       ; %bb.0: ; %entry
541; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
542; GFX11-NEXT:    s_mov_b32 s10, -1
543; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
544; GFX11-NEXT:    s_mov_b32 s14, s10
545; GFX11-NEXT:    s_mov_b32 s15, s11
546; GFX11-NEXT:    s_mov_b32 s18, s10
547; GFX11-NEXT:    s_mov_b32 s19, s11
548; GFX11-NEXT:    s_mov_b32 s22, s10
549; GFX11-NEXT:    s_mov_b32 s23, s11
550; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX11-NEXT:    s_mov_b32 s12, s2
552; GFX11-NEXT:    s_mov_b32 s13, s3
553; GFX11-NEXT:    s_mov_b32 s16, s4
554; GFX11-NEXT:    s_mov_b32 s17, s5
555; GFX11-NEXT:    s_mov_b32 s20, s6
556; GFX11-NEXT:    s_mov_b32 s21, s7
557; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
558; GFX11-NEXT:    s_waitcnt vmcnt(0)
559; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
560; GFX11-NEXT:    s_waitcnt vmcnt(0)
561; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
562; GFX11-NEXT:    s_waitcnt vmcnt(0)
563; GFX11-NEXT:    s_mov_b32 s8, s0
564; GFX11-NEXT:    s_mov_b32 s9, s1
565; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
566; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
567; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
568; GFX11-NEXT:    s_endpgm
569    ptr addrspace(1) %r,
570    ptr addrspace(1) %a,
571    ptr addrspace(1) %b,
572    ptr addrspace(1) %c) {
573entry:
574  %a.val = load volatile half, ptr addrspace(1) %a
575  %b.val = load volatile half, ptr addrspace(1) %b
576  %c.val = load volatile half, ptr addrspace(1) %c
577  %fcmp = fcmp olt half %a.val, %b.val
  ; Constant in the false position of the select.
578  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
579  store half %r.val, ptr addrspace(1) %r
580  ret void
581}
582
; Element-wise <2 x half> select driven by an ordered less-than compare:
;   r[i] = (a[i] < b[i]) ? c[i] : d[i]
; Unlike the scalar tests in this file, the loads here are not volatile; in
; the expected output the four dword loads are issued before any wait and the
; waits use decreasing vmcnt values.
; Each target's checks extract the high halves with v_lshrrev_b32 by 16,
; perform one compare + v_cndmask per lane, and recombine the two results
; into a single dword store. The SI checks additionally convert every lane
; to f32 and back.
583define amdgpu_kernel void @select_v2f16(
584; SI-LABEL: select_v2f16:
585; SI:       ; %bb.0: ; %entry
586; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
587; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
588; SI-NEXT:    s_mov_b32 s3, 0xf000
589; SI-NEXT:    s_mov_b32 s2, -1
590; SI-NEXT:    s_mov_b32 s18, s2
591; SI-NEXT:    s_waitcnt lgkmcnt(0)
592; SI-NEXT:    s_mov_b32 s16, s10
593; SI-NEXT:    s_mov_b32 s17, s11
594; SI-NEXT:    s_mov_b32 s19, s3
595; SI-NEXT:    s_mov_b32 s20, s12
596; SI-NEXT:    s_mov_b32 s21, s13
597; SI-NEXT:    s_mov_b32 s22, s2
598; SI-NEXT:    s_mov_b32 s23, s3
599; SI-NEXT:    s_mov_b32 s6, s2
600; SI-NEXT:    s_mov_b32 s7, s3
601; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
602; SI-NEXT:    s_mov_b32 s12, s14
603; SI-NEXT:    s_mov_b32 s13, s15
604; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
605; SI-NEXT:    s_mov_b32 s14, s2
606; SI-NEXT:    s_mov_b32 s15, s3
607; SI-NEXT:    buffer_load_dword v2, off, s[20:23], 0
608; SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0
609; SI-NEXT:    s_mov_b32 s0, s8
610; SI-NEXT:    s_mov_b32 s1, s9
611; SI-NEXT:    s_waitcnt vmcnt(3)
612; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
613; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
614; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
615; SI-NEXT:    s_waitcnt vmcnt(2)
616; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
617; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
618; SI-NEXT:    s_waitcnt vmcnt(1)
619; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
620; SI-NEXT:    s_waitcnt vmcnt(0)
621; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
622; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
623; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
624; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
625; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
626; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
627; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
628; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
629; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
630; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
631; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
632; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
633; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
634; SI-NEXT:    v_or_b32_e32 v0, v1, v0
635; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
636; SI-NEXT:    s_endpgm
637;
638; VI-LABEL: select_v2f16:
639; VI:       ; %bb.0: ; %entry
640; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
641; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x44
642; VI-NEXT:    s_mov_b32 s3, 0xf000
643; VI-NEXT:    s_mov_b32 s2, -1
644; VI-NEXT:    s_mov_b32 s6, s2
645; VI-NEXT:    s_mov_b32 s7, s3
646; VI-NEXT:    s_waitcnt lgkmcnt(0)
647; VI-NEXT:    s_mov_b32 s16, s10
648; VI-NEXT:    s_mov_b32 s17, s11
649; VI-NEXT:    s_mov_b32 s18, s2
650; VI-NEXT:    s_mov_b32 s19, s3
651; VI-NEXT:    s_mov_b32 s20, s12
652; VI-NEXT:    s_mov_b32 s21, s13
653; VI-NEXT:    s_mov_b32 s22, s2
654; VI-NEXT:    s_mov_b32 s23, s3
655; VI-NEXT:    s_mov_b32 s12, s14
656; VI-NEXT:    s_mov_b32 s13, s15
657; VI-NEXT:    s_mov_b32 s14, s2
658; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
659; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
660; VI-NEXT:    buffer_load_dword v2, off, s[16:19], 0
661; VI-NEXT:    s_mov_b32 s15, s3
662; VI-NEXT:    buffer_load_dword v3, off, s[12:15], 0
663; VI-NEXT:    s_mov_b32 s0, s8
664; VI-NEXT:    s_mov_b32 s1, s9
665; VI-NEXT:    s_waitcnt vmcnt(3)
666; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
667; VI-NEXT:    s_waitcnt vmcnt(2)
668; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
669; VI-NEXT:    s_waitcnt vmcnt(1)
670; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
671; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v2, v1
672; VI-NEXT:    s_waitcnt vmcnt(0)
673; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
674; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
675; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
676; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
677; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
678; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
679; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
680; VI-NEXT:    s_endpgm
681;
682; GFX11-LABEL: select_v2f16:
683; GFX11:       ; %bb.0: ; %entry
684; GFX11-NEXT:    s_clause 0x1
685; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
686; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x44
687; GFX11-NEXT:    s_mov_b32 s2, -1
688; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
689; GFX11-NEXT:    s_mov_b32 s6, s2
690; GFX11-NEXT:    s_mov_b32 s7, s3
691; GFX11-NEXT:    s_mov_b32 s22, s2
692; GFX11-NEXT:    s_mov_b32 s23, s3
693; GFX11-NEXT:    s_mov_b32 s18, s2
694; GFX11-NEXT:    s_mov_b32 s19, s3
695; GFX11-NEXT:    s_mov_b32 s26, s2
696; GFX11-NEXT:    s_mov_b32 s27, s3
697; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX11-NEXT:    s_mov_b32 s20, s12
699; GFX11-NEXT:    s_mov_b32 s21, s13
700; GFX11-NEXT:    s_mov_b32 s16, s10
701; GFX11-NEXT:    s_mov_b32 s17, s11
702; GFX11-NEXT:    s_mov_b32 s24, s14
703; GFX11-NEXT:    s_mov_b32 s25, s15
704; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
705; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
706; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
707; GFX11-NEXT:    buffer_load_b32 v3, off, s[24:27], 0
708; GFX11-NEXT:    s_mov_b32 s0, s8
709; GFX11-NEXT:    s_mov_b32 s1, s9
710; GFX11-NEXT:    s_waitcnt vmcnt(3)
711; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
712; GFX11-NEXT:    s_waitcnt vmcnt(2)
713; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
714; GFX11-NEXT:    s_waitcnt vmcnt(1)
715; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
716; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v2, v1
717; GFX11-NEXT:    s_waitcnt vmcnt(0)
718; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
719; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
720; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v6, v5
721; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
722; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
723; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
724; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
725; GFX11-NEXT:    s_endpgm
726    ptr addrspace(1) %r,
727    ptr addrspace(1) %a,
728    ptr addrspace(1) %b,
729    ptr addrspace(1) %c,
730    ptr addrspace(1) %d) {
731entry:
  ; Plain (non-volatile) vector loads, one <2 x half> per pointer argument.
732  %a.val = load <2 x half>, ptr addrspace(1) %a
733  %b.val = load <2 x half>, ptr addrspace(1) %b
734  %c.val = load <2 x half>, ptr addrspace(1) %c
735  %d.val = load <2 x half>, ptr addrspace(1) %d
  ; The compare yields a <2 x i1> mask that drives a per-lane select.
736  %fcmp = fcmp olt <2 x half> %a.val, %b.val
737  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
738  store <2 x half> %r.val, ptr addrspace(1) %r
739  ret void
740}
741
742define amdgpu_kernel void @select_v2f16_imm_a(
743; SI-LABEL: select_v2f16_imm_a:
744; SI:       ; %bb.0: ; %entry
745; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
746; SI-NEXT:    s_mov_b32 s11, 0xf000
747; SI-NEXT:    s_mov_b32 s10, -1
748; SI-NEXT:    s_mov_b32 s14, s10
749; SI-NEXT:    s_mov_b32 s15, s11
750; SI-NEXT:    s_waitcnt lgkmcnt(0)
751; SI-NEXT:    s_mov_b32 s12, s2
752; SI-NEXT:    s_mov_b32 s13, s3
753; SI-NEXT:    s_mov_b32 s16, s4
754; SI-NEXT:    s_mov_b32 s17, s5
755; SI-NEXT:    s_mov_b32 s18, s10
756; SI-NEXT:    s_mov_b32 s19, s11
757; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
758; SI-NEXT:    s_mov_b32 s4, s6
759; SI-NEXT:    s_mov_b32 s5, s7
760; SI-NEXT:    s_mov_b32 s6, s10
761; SI-NEXT:    s_mov_b32 s7, s11
762; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
763; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
764; SI-NEXT:    s_mov_b32 s2, 0x3f200000
765; SI-NEXT:    s_mov_b32 s8, s0
766; SI-NEXT:    s_mov_b32 s9, s1
767; SI-NEXT:    s_waitcnt vmcnt(2)
768; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
769; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
770; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
771; SI-NEXT:    s_waitcnt vmcnt(1)
772; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
773; SI-NEXT:    s_waitcnt vmcnt(0)
774; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
775; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
776; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
777; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
778; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
779; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
780; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
781; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
782; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
783; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
784; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
785; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
786; SI-NEXT:    v_or_b32_e32 v0, v0, v1
787; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
788; SI-NEXT:    s_endpgm
789;
790; VI-LABEL: select_v2f16_imm_a:
791; VI:       ; %bb.0: ; %entry
792; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
793; VI-NEXT:    s_mov_b32 s11, 0xf000
794; VI-NEXT:    s_mov_b32 s10, -1
795; VI-NEXT:    s_mov_b32 s14, s10
796; VI-NEXT:    s_mov_b32 s15, s11
797; VI-NEXT:    s_waitcnt lgkmcnt(0)
798; VI-NEXT:    s_mov_b32 s12, s2
799; VI-NEXT:    s_mov_b32 s13, s3
800; VI-NEXT:    s_mov_b32 s16, s4
801; VI-NEXT:    s_mov_b32 s17, s5
802; VI-NEXT:    s_mov_b32 s18, s10
803; VI-NEXT:    s_mov_b32 s19, s11
804; VI-NEXT:    s_mov_b32 s4, s6
805; VI-NEXT:    s_mov_b32 s5, s7
806; VI-NEXT:    s_mov_b32 s6, s10
807; VI-NEXT:    s_mov_b32 s7, s11
808; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
809; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
810; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
811; VI-NEXT:    s_movk_i32 s2, 0x3900
812; VI-NEXT:    s_mov_b32 s8, s0
813; VI-NEXT:    s_mov_b32 s9, s1
814; VI-NEXT:    s_waitcnt vmcnt(2)
815; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
816; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
817; VI-NEXT:    s_waitcnt vmcnt(0)
818; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
819; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
820; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
821; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
822; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
823; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
824; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
825; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
826; VI-NEXT:    s_endpgm
827;
828; GFX11-LABEL: select_v2f16_imm_a:
829; GFX11:       ; %bb.0: ; %entry
830; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
831; GFX11-NEXT:    s_mov_b32 s10, -1
832; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
833; GFX11-NEXT:    s_mov_b32 s14, s10
834; GFX11-NEXT:    s_mov_b32 s15, s11
835; GFX11-NEXT:    s_mov_b32 s18, s10
836; GFX11-NEXT:    s_mov_b32 s19, s11
837; GFX11-NEXT:    s_mov_b32 s22, s10
838; GFX11-NEXT:    s_mov_b32 s23, s11
839; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
840; GFX11-NEXT:    s_mov_b32 s12, s2
841; GFX11-NEXT:    s_mov_b32 s13, s3
842; GFX11-NEXT:    s_mov_b32 s16, s4
843; GFX11-NEXT:    s_mov_b32 s17, s5
844; GFX11-NEXT:    s_mov_b32 s20, s6
845; GFX11-NEXT:    s_mov_b32 s21, s7
846; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
847; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
848; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
849; GFX11-NEXT:    s_mov_b32 s8, s0
850; GFX11-NEXT:    s_mov_b32 s9, s1
851; GFX11-NEXT:    s_waitcnt vmcnt(2)
852; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
853; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
854; GFX11-NEXT:    s_waitcnt vmcnt(1)
855; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
856; GFX11-NEXT:    s_waitcnt vmcnt(0)
857; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
858; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
859; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
860; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
861; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
862; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
863; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
864; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
865; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
866; GFX11-NEXT:    s_endpgm
867    ptr addrspace(1) %r,
868    ptr addrspace(1) %b,
869    ptr addrspace(1) %c,
870    ptr addrspace(1) %d) {
871entry:
872  %b.val = load <2 x half>, ptr addrspace(1) %b
873  %c.val = load <2 x half>, ptr addrspace(1) %c
874  %d.val = load <2 x half>, ptr addrspace(1) %d
875  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
876  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
877  store <2 x half> %r.val, ptr addrspace(1) %r
878  ret void
879}
880
; Kernel test: <2 x half> select where the compare's second operand is a
; constant vector. Loads %a, %c and %d, evaluates
; (%a olt <0xH3800, 0xH3900>) per lane and stores the chosen %c/%d lanes to %r.
; In the expected output the constant appears as the first compare source and
; the compare is emitted as v_cmp_gt; the tahiti checks additionally convert
; the halves with v_cvt_f32_f16 and compare in f32.
881define amdgpu_kernel void @select_v2f16_imm_b(
882; SI-LABEL: select_v2f16_imm_b:
883; SI:       ; %bb.0: ; %entry
884; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
885; SI-NEXT:    s_mov_b32 s11, 0xf000
886; SI-NEXT:    s_mov_b32 s10, -1
887; SI-NEXT:    s_mov_b32 s14, s10
888; SI-NEXT:    s_mov_b32 s15, s11
889; SI-NEXT:    s_waitcnt lgkmcnt(0)
890; SI-NEXT:    s_mov_b32 s12, s2
891; SI-NEXT:    s_mov_b32 s13, s3
892; SI-NEXT:    s_mov_b32 s16, s4
893; SI-NEXT:    s_mov_b32 s17, s5
894; SI-NEXT:    s_mov_b32 s18, s10
895; SI-NEXT:    s_mov_b32 s19, s11
896; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
897; SI-NEXT:    s_mov_b32 s4, s6
898; SI-NEXT:    s_mov_b32 s5, s7
899; SI-NEXT:    s_mov_b32 s6, s10
900; SI-NEXT:    s_mov_b32 s7, s11
901; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
902; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
903; SI-NEXT:    s_mov_b32 s2, 0x3f200000
904; SI-NEXT:    s_mov_b32 s8, s0
905; SI-NEXT:    s_mov_b32 s9, s1
906; SI-NEXT:    s_waitcnt vmcnt(2)
907; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
908; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
909; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
910; SI-NEXT:    s_waitcnt vmcnt(1)
911; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
912; SI-NEXT:    s_waitcnt vmcnt(0)
913; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
914; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
915; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
916; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
917; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
918; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
919; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
920; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
921; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
922; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
923; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
924; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
925; SI-NEXT:    v_or_b32_e32 v0, v0, v1
926; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
927; SI-NEXT:    s_endpgm
928;
929; VI-LABEL: select_v2f16_imm_b:
930; VI:       ; %bb.0: ; %entry
931; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
932; VI-NEXT:    s_mov_b32 s11, 0xf000
933; VI-NEXT:    s_mov_b32 s10, -1
934; VI-NEXT:    s_mov_b32 s14, s10
935; VI-NEXT:    s_mov_b32 s15, s11
936; VI-NEXT:    s_waitcnt lgkmcnt(0)
937; VI-NEXT:    s_mov_b32 s12, s2
938; VI-NEXT:    s_mov_b32 s13, s3
939; VI-NEXT:    s_mov_b32 s16, s4
940; VI-NEXT:    s_mov_b32 s17, s5
941; VI-NEXT:    s_mov_b32 s18, s10
942; VI-NEXT:    s_mov_b32 s19, s11
943; VI-NEXT:    s_mov_b32 s4, s6
944; VI-NEXT:    s_mov_b32 s5, s7
945; VI-NEXT:    s_mov_b32 s6, s10
946; VI-NEXT:    s_mov_b32 s7, s11
947; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
948; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
949; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
950; VI-NEXT:    s_movk_i32 s2, 0x3900
951; VI-NEXT:    s_mov_b32 s8, s0
952; VI-NEXT:    s_mov_b32 s9, s1
953; VI-NEXT:    s_waitcnt vmcnt(2)
954; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
955; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
956; VI-NEXT:    s_waitcnt vmcnt(0)
957; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
958; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
959; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
960; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
961; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
962; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
963; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
964; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
965; VI-NEXT:    s_endpgm
966;
967; GFX11-LABEL: select_v2f16_imm_b:
968; GFX11:       ; %bb.0: ; %entry
969; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
970; GFX11-NEXT:    s_mov_b32 s10, -1
971; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
972; GFX11-NEXT:    s_mov_b32 s14, s10
973; GFX11-NEXT:    s_mov_b32 s15, s11
974; GFX11-NEXT:    s_mov_b32 s18, s10
975; GFX11-NEXT:    s_mov_b32 s19, s11
976; GFX11-NEXT:    s_mov_b32 s22, s10
977; GFX11-NEXT:    s_mov_b32 s23, s11
978; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX11-NEXT:    s_mov_b32 s12, s2
980; GFX11-NEXT:    s_mov_b32 s13, s3
981; GFX11-NEXT:    s_mov_b32 s16, s4
982; GFX11-NEXT:    s_mov_b32 s17, s5
983; GFX11-NEXT:    s_mov_b32 s20, s6
984; GFX11-NEXT:    s_mov_b32 s21, s7
985; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
986; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
987; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
988; GFX11-NEXT:    s_mov_b32 s8, s0
989; GFX11-NEXT:    s_mov_b32 s9, s1
990; GFX11-NEXT:    s_waitcnt vmcnt(2)
991; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
992; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
993; GFX11-NEXT:    s_waitcnt vmcnt(1)
994; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
995; GFX11-NEXT:    s_waitcnt vmcnt(0)
996; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
997; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
998; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3
999; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1000; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
1001; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1002; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1003; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1004; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1005; GFX11-NEXT:    s_endpgm
1006    ptr addrspace(1) %r,
1007    ptr addrspace(1) %a,
1008    ptr addrspace(1) %c,
1009    ptr addrspace(1) %d) {
1010entry:
1011  %a.val = load <2 x half>, ptr addrspace(1) %a
1012  %c.val = load <2 x half>, ptr addrspace(1) %c
1013  %d.val = load <2 x half>, ptr addrspace(1) %d
1014  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900> ; constant vector is the right-hand compare operand
1015  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val ; per lane: %c.val when true, %d.val when false
1016  store <2 x half> %r.val, ptr addrspace(1) %r
1017  ret void
1018}
1019
; Kernel test: <2 x half> select whose true-value is a constant vector.
; Loads %a, %b and %d, evaluates (%a olt %b) per lane and stores either the
; constant <0xH3800, 0xH3900> (true) or the %d lane (false) to %r.
; The expected output uses v_cmp_nlt and passes the constant as the first
; v_cndmask source; tahiti materializes the second lane's constant as
; 0x3f200000 because its checks compare and select in f32.
1020define amdgpu_kernel void @select_v2f16_imm_c(
1021; SI-LABEL: select_v2f16_imm_c:
1022; SI:       ; %bb.0: ; %entry
1023; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1024; SI-NEXT:    s_mov_b32 s11, 0xf000
1025; SI-NEXT:    s_mov_b32 s10, -1
1026; SI-NEXT:    s_mov_b32 s14, s10
1027; SI-NEXT:    s_mov_b32 s15, s11
1028; SI-NEXT:    s_waitcnt lgkmcnt(0)
1029; SI-NEXT:    s_mov_b32 s12, s2
1030; SI-NEXT:    s_mov_b32 s13, s3
1031; SI-NEXT:    s_mov_b32 s16, s4
1032; SI-NEXT:    s_mov_b32 s17, s5
1033; SI-NEXT:    s_mov_b32 s18, s10
1034; SI-NEXT:    s_mov_b32 s19, s11
1035; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1036; SI-NEXT:    s_mov_b32 s4, s6
1037; SI-NEXT:    s_mov_b32 s5, s7
1038; SI-NEXT:    s_mov_b32 s6, s10
1039; SI-NEXT:    s_mov_b32 s7, s11
1040; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
1041; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
1042; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
1043; SI-NEXT:    s_mov_b32 s8, s0
1044; SI-NEXT:    s_mov_b32 s9, s1
1045; SI-NEXT:    s_waitcnt vmcnt(2)
1046; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1047; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1048; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1049; SI-NEXT:    s_waitcnt vmcnt(1)
1050; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1051; SI-NEXT:    s_waitcnt vmcnt(0)
1052; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1053; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1054; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1055; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1056; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1057; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
1058; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
1059; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
1060; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1061; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
1062; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1063; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1064; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1065; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1066; SI-NEXT:    s_endpgm
1067;
1068; VI-LABEL: select_v2f16_imm_c:
1069; VI:       ; %bb.0: ; %entry
1070; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1071; VI-NEXT:    s_mov_b32 s11, 0xf000
1072; VI-NEXT:    s_mov_b32 s10, -1
1073; VI-NEXT:    s_mov_b32 s18, s10
1074; VI-NEXT:    s_mov_b32 s19, s11
1075; VI-NEXT:    s_waitcnt lgkmcnt(0)
1076; VI-NEXT:    s_mov_b32 s16, s4
1077; VI-NEXT:    s_mov_b32 s17, s5
1078; VI-NEXT:    s_mov_b32 s14, s10
1079; VI-NEXT:    s_mov_b32 s12, s2
1080; VI-NEXT:    s_mov_b32 s13, s3
1081; VI-NEXT:    s_mov_b32 s15, s11
1082; VI-NEXT:    s_mov_b32 s4, s6
1083; VI-NEXT:    s_mov_b32 s5, s7
1084; VI-NEXT:    s_mov_b32 s6, s10
1085; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
1086; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
1087; VI-NEXT:    s_mov_b32 s7, s11
1088; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
1089; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
1090; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
1091; VI-NEXT:    s_mov_b32 s8, s0
1092; VI-NEXT:    s_mov_b32 s9, s1
1093; VI-NEXT:    s_waitcnt vmcnt(2)
1094; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1095; VI-NEXT:    s_waitcnt vmcnt(1)
1096; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
1097; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v0
1098; VI-NEXT:    s_waitcnt vmcnt(0)
1099; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
1100; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1101; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
1102; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
1103; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1104; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1105; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1106; VI-NEXT:    s_endpgm
1107;
1108; GFX11-LABEL: select_v2f16_imm_c:
1109; GFX11:       ; %bb.0: ; %entry
1110; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
1111; GFX11-NEXT:    s_mov_b32 s10, -1
1112; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1113; GFX11-NEXT:    s_mov_b32 s18, s10
1114; GFX11-NEXT:    s_mov_b32 s19, s11
1115; GFX11-NEXT:    s_mov_b32 s14, s10
1116; GFX11-NEXT:    s_mov_b32 s15, s11
1117; GFX11-NEXT:    s_mov_b32 s22, s10
1118; GFX11-NEXT:    s_mov_b32 s23, s11
1119; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1120; GFX11-NEXT:    s_mov_b32 s16, s4
1121; GFX11-NEXT:    s_mov_b32 s17, s5
1122; GFX11-NEXT:    s_mov_b32 s12, s2
1123; GFX11-NEXT:    s_mov_b32 s13, s3
1124; GFX11-NEXT:    s_mov_b32 s20, s6
1125; GFX11-NEXT:    s_mov_b32 s21, s7
1126; GFX11-NEXT:    buffer_load_b32 v0, off, s[16:19], 0
1127; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
1128; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
1129; GFX11-NEXT:    s_mov_b32 s8, s0
1130; GFX11-NEXT:    s_mov_b32 s9, s1
1131; GFX11-NEXT:    s_waitcnt vmcnt(2)
1132; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1133; GFX11-NEXT:    s_waitcnt vmcnt(1)
1134; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1135; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v0
1136; GFX11-NEXT:    s_waitcnt vmcnt(0)
1137; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1138; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
1139; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v4, v3
1140; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
1141; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1142; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
1143; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1144; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1145; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1146; GFX11-NEXT:    s_endpgm
1147    ptr addrspace(1) %r,
1148    ptr addrspace(1) %a,
1149    ptr addrspace(1) %b,
1150    ptr addrspace(1) %d) {
1151entry:
1152  %a.val = load <2 x half>, ptr addrspace(1) %a
1153  %b.val = load <2 x half>, ptr addrspace(1) %b
1154  %d.val = load <2 x half>, ptr addrspace(1) %d
1155  %fcmp = fcmp olt <2 x half> %a.val, %b.val ; both compare operands are loaded values
1156  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val ; constant vector is the true-value
1157  store <2 x half> %r.val, ptr addrspace(1) %r
1158  ret void
1159}
1160
; Kernel test: <2 x half> select whose false-value is a constant vector.
; Loads %a, %b and %c, evaluates (%a olt %b) per lane and stores either the
; %c lane (true) or the constant <0xH3800, 0xH3900> (false) to %r.
; The expected output keeps the compare as v_cmp_lt and passes the constant as
; the first v_cndmask source; tahiti materializes the second lane's constant
; as 0x3f200000 because its checks compare and select in f32.
1161define amdgpu_kernel void @select_v2f16_imm_d(
1162; SI-LABEL: select_v2f16_imm_d:
1163; SI:       ; %bb.0: ; %entry
1164; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1165; SI-NEXT:    s_mov_b32 s11, 0xf000
1166; SI-NEXT:    s_mov_b32 s10, -1
1167; SI-NEXT:    s_mov_b32 s14, s10
1168; SI-NEXT:    s_mov_b32 s15, s11
1169; SI-NEXT:    s_waitcnt lgkmcnt(0)
1170; SI-NEXT:    s_mov_b32 s12, s2
1171; SI-NEXT:    s_mov_b32 s13, s3
1172; SI-NEXT:    s_mov_b32 s16, s4
1173; SI-NEXT:    s_mov_b32 s17, s5
1174; SI-NEXT:    s_mov_b32 s18, s10
1175; SI-NEXT:    s_mov_b32 s19, s11
1176; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1177; SI-NEXT:    s_mov_b32 s4, s6
1178; SI-NEXT:    s_mov_b32 s5, s7
1179; SI-NEXT:    s_mov_b32 s6, s10
1180; SI-NEXT:    s_mov_b32 s7, s11
1181; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
1182; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
1183; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
1184; SI-NEXT:    s_mov_b32 s8, s0
1185; SI-NEXT:    s_mov_b32 s9, s1
1186; SI-NEXT:    s_waitcnt vmcnt(2)
1187; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
1188; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1189; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1190; SI-NEXT:    s_waitcnt vmcnt(1)
1191; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1192; SI-NEXT:    s_waitcnt vmcnt(0)
1193; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1194; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1195; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1196; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1197; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1198; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
1199; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1200; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
1201; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1202; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
1203; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1204; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
1205; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1206; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1207; SI-NEXT:    s_endpgm
1208;
1209; VI-LABEL: select_v2f16_imm_d:
1210; VI:       ; %bb.0: ; %entry
1211; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1212; VI-NEXT:    s_mov_b32 s11, 0xf000
1213; VI-NEXT:    s_mov_b32 s10, -1
1214; VI-NEXT:    s_mov_b32 s18, s10
1215; VI-NEXT:    s_mov_b32 s19, s11
1216; VI-NEXT:    s_waitcnt lgkmcnt(0)
1217; VI-NEXT:    s_mov_b32 s16, s4
1218; VI-NEXT:    s_mov_b32 s17, s5
1219; VI-NEXT:    s_mov_b32 s14, s10
1220; VI-NEXT:    s_mov_b32 s12, s2
1221; VI-NEXT:    s_mov_b32 s13, s3
1222; VI-NEXT:    s_mov_b32 s15, s11
1223; VI-NEXT:    s_mov_b32 s4, s6
1224; VI-NEXT:    s_mov_b32 s5, s7
1225; VI-NEXT:    s_mov_b32 s6, s10
1226; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
1227; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
1228; VI-NEXT:    s_mov_b32 s7, s11
1229; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
1230; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
1231; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
1232; VI-NEXT:    s_mov_b32 s8, s0
1233; VI-NEXT:    s_mov_b32 s9, s1
1234; VI-NEXT:    s_waitcnt vmcnt(2)
1235; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1236; VI-NEXT:    s_waitcnt vmcnt(1)
1237; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
1238; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v1, v0
1239; VI-NEXT:    s_waitcnt vmcnt(0)
1240; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
1241; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1242; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
1243; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
1244; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1245; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1246; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1247; VI-NEXT:    s_endpgm
1248;
1249; GFX11-LABEL: select_v2f16_imm_d:
1250; GFX11:       ; %bb.0: ; %entry
1251; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
1252; GFX11-NEXT:    s_mov_b32 s10, -1
1253; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1254; GFX11-NEXT:    s_mov_b32 s18, s10
1255; GFX11-NEXT:    s_mov_b32 s19, s11
1256; GFX11-NEXT:    s_mov_b32 s14, s10
1257; GFX11-NEXT:    s_mov_b32 s15, s11
1258; GFX11-NEXT:    s_mov_b32 s22, s10
1259; GFX11-NEXT:    s_mov_b32 s23, s11
1260; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX11-NEXT:    s_mov_b32 s16, s4
1262; GFX11-NEXT:    s_mov_b32 s17, s5
1263; GFX11-NEXT:    s_mov_b32 s12, s2
1264; GFX11-NEXT:    s_mov_b32 s13, s3
1265; GFX11-NEXT:    s_mov_b32 s20, s6
1266; GFX11-NEXT:    s_mov_b32 s21, s7
1267; GFX11-NEXT:    buffer_load_b32 v0, off, s[16:19], 0
1268; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
1269; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
1270; GFX11-NEXT:    s_mov_b32 s8, s0
1271; GFX11-NEXT:    s_mov_b32 s9, s1
1272; GFX11-NEXT:    s_waitcnt vmcnt(2)
1273; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1274; GFX11-NEXT:    s_waitcnt vmcnt(1)
1275; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1276; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
1277; GFX11-NEXT:    s_waitcnt vmcnt(0)
1278; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1279; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
1280; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v4, v3
1281; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
1282; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1283; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
1284; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1285; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1286; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1287; GFX11-NEXT:    s_endpgm
1288    ptr addrspace(1) %r,
1289    ptr addrspace(1) %a,
1290    ptr addrspace(1) %b,
1291    ptr addrspace(1) %c) {
1292entry:
1293  %a.val = load <2 x half>, ptr addrspace(1) %a
1294  %b.val = load <2 x half>, ptr addrspace(1) %b
1295  %c.val = load <2 x half>, ptr addrspace(1) %c
1296  %fcmp = fcmp olt <2 x half> %a.val, %b.val ; both compare operands are loaded values
1297  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900> ; constant vector is the false-value
1298  store <2 x half> %r.val, ptr addrspace(1) %r
1299  ret void
1300}
1301
; Non-kernel function test: one scalar condition (%cond == 0) picks the whole
; <4 x half> vector %a (true) or %b (false).
; The fiji and gfx1100 expectations are a single v_cmp_eq_u32 followed by two
; 32-bit conditional moves (gfx1100 pairs them in one v_dual_cndmask_b32).
; The tahiti expectations first convert the inputs with v_cvt_f16_f32, pack
; them in pairs, select on the packed dwords, then unpack and convert back.
1302define <4 x half> @v_select_v4f16(<4 x half> %a, <4 x half> %b, i32 %cond) {
1303; SI-LABEL: v_select_v4f16:
1304; SI:       ; %bb.0:
1305; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1306; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1307; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1308; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1309; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1310; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1311; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1312; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1313; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1314; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1315; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1316; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1317; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
1318; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1319; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
1320; SI-NEXT:    v_or_b32_e32 v3, v6, v3
1321; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1322; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
1323; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
1324; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
1325; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1326; SI-NEXT:    v_cvt_f32_f16_e32 v2, v3
1327; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1328; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1329; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1330; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1331; SI-NEXT:    s_setpc_b64 s[30:31]
1332;
1333; VI-LABEL: v_select_v4f16:
1334; VI:       ; %bb.0:
1335; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
1337; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
1338; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
1339; VI-NEXT:    s_setpc_b64 s[30:31]
1340;
1341; GFX11-LABEL: v_select_v4f16:
1342; GFX11:       ; %bb.0:
1343; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1344; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
1345; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
1346; GFX11-NEXT:    s_setpc_b64 s[30:31]
1347  %cmp = icmp eq i32 %cond, 0 ; single i1 condition shared by all four lanes
1348  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
1349  ret <4 x half> %select
1350}
1351
; Non-kernel function test: per-lane select on <4 x half>. Each lane of the
; <4 x i32> %cond is compared with zero and independently picks the matching
; lane of %a (true) or %b (false).
; All three expectation sets contain four v_cmp_eq_u32 instructions, one per
; condition lane. The fiji and gfx1100 expectations shift out the high halves
; (v_lshrrev_b32 by 16) and repack the results (v_or_b32_sdwa / v_perm_b32);
; the tahiti expectations round-trip through v_cvt_f16_f32 / v_cvt_f32_f16
; and select on one value per register.
1352define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond) {
1353; SI-LABEL: v_vselect_v4f16:
1354; SI:       ; %bb.0:
1355; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1356; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1357; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1358; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1359; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1360; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1361; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1362; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1363; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1364; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1365; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1366; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1367; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1368; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1369; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1370; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1371; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1372; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
1373; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
1374; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
1375; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1376; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
1377; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
1378; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
1379; SI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
1380; SI-NEXT:    s_setpc_b64 s[30:31]
1381;
1382; VI-LABEL: v_vselect_v4f16:
1383; VI:       ; %bb.0:
1384; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1385; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1386; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
1387; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
1388; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
1389; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
1390; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
1391; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
1392; VI-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc
1393; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
1394; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
1395; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
1396; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
1397; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
1398; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1399; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
1400; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1401; VI-NEXT:    s_setpc_b64 s[30:31]
1402;
1403; GFX11-LABEL: v_vselect_v4f16:
1404; GFX11:       ; %bb.0:
1405; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1406; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1407; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
1408; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
1409; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1410; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
1411; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1412; GFX11-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc_lo
1413; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
1414; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
1415; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
1416; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
1417; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
1418; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1419; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
1420; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
1421; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
1422; GFX11-NEXT:    s_setpc_b64 s[30:31]
1423  %cmp = icmp eq <4 x i32> %cond, zeroinitializer ; one i1 per lane
1424  %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
1425  ret <4 x half> %select
1426}
1427
; Non-kernel function test: one scalar condition (%cond == 0) picks the whole
; <8 x half> vector %a (true) or %b (false).
; The fiji and gfx1100 expectations are a single v_cmp_eq_u32 followed by four
; 32-bit conditional moves (gfx1100 pairs them in two v_dual_cndmask_b32).
; The tahiti expectations first convert the inputs with v_cvt_f16_f32, pack
; them in pairs, select on the packed dwords, then unpack and convert back.
1428define <8 x half> @v_select_v8f16(<8 x half> %a, <8 x half> %b, i32 %cond) {
1429; SI-LABEL: v_select_v8f16:
1430; SI:       ; %bb.0:
1431; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1432; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1433; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1434; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1435; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1436; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1437; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
1438; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1439; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
1440; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1441; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
1442; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1443; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
1444; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1445; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1446; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1447; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1448; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1449; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1450; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1451; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1452; SI-NEXT:    v_or_b32_e32 v6, v6, v7
1453; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
1454; SI-NEXT:    v_or_b32_e32 v4, v4, v5
1455; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v13
1456; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1457; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
1458; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1459; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
1460; SI-NEXT:    v_or_b32_e32 v7, v14, v7
1461; SI-NEXT:    v_or_b32_e32 v5, v12, v5
1462; SI-NEXT:    v_or_b32_e32 v3, v10, v3
1463; SI-NEXT:    v_or_b32_e32 v1, v8, v1
1464; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
1465; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
1466; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
1467; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
1468; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
1469; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1470; SI-NEXT:    v_cvt_f32_f16_e32 v2, v3
1471; SI-NEXT:    v_cvt_f32_f16_e32 v4, v5
1472; SI-NEXT:    v_cvt_f32_f16_e32 v6, v7
1473; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1474; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1475; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1476; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1477; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1478; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1479; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1480; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1481; SI-NEXT:    s_setpc_b64 s[30:31]
1482;
1483; VI-LABEL: v_select_v8f16:
1484; VI:       ; %bb.0:
1485; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1486; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
1487; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
1488; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1489; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
1490; VI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
1491; VI-NEXT:    s_setpc_b64 s[30:31]
1492;
1493; GFX11-LABEL: v_select_v8f16:
1494; GFX11:       ; %bb.0:
1495; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1496; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
1497; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
1498; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
1499; GFX11-NEXT:    s_setpc_b64 s[30:31]
1500  %cmp = icmp eq i32 %cond, 0 ; single i1 condition shared by all eight lanes
1501  %select = select i1 %cmp, <8 x half> %a, <8 x half> %b
1502  ret <8 x half> %select
1503}
1504
; Per-lane select of <8 x half>: result lane i is %a[i] when %cond[i] == 0,
; otherwise %b[i] (vector icmp eq against zeroinitializer feeding a vector select).
; The expected output below shows one v_cmp_eq_u32 + v_cndmask_b32 pair per lane
; for every run line. The tahiti run round-trips each half operand through
; v_cvt_f16_f32 / v_cvt_f32_f16; the fiji run splits the high halves off with
; v_lshrrev_b32 and repacks with v_or_b32_sdwa; the gfx1100 run repacks with
; v_perm_b32.
; The check lines are autogenerated (see the header of this file), so regenerate
; them with the update script instead of editing them by hand.
1505define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond) {
1506; SI-LABEL: v_vselect_v8f16:
1507; SI:       ; %bb.0:
1508; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1509; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1510; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1511; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
1512; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1513; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1514; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1515; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1516; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1517; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1518; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
1519; SI-NEXT:    v_cvt_f16_f32_e32 v8, v9
1520; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1521; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
1522; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1523; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1524; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1525; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1526; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
1527; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
1528; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1529; SI-NEXT:    v_cvt_f16_f32_e32 v9, v14
1530; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1531; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
1532; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
1533; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1534; SI-NEXT:    v_cvt_f16_f32_e32 v8, v15
1535; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1536; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
1537; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1538; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
1539; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1540; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
1541; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
1542; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1543; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
1544; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
1545; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
1546; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1547; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1548; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
1549; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v20
1550; SI-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
1551; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v21
1552; SI-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
1553; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
1554; SI-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
1555; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
1556; SI-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
1557; SI-NEXT:    s_setpc_b64 s[30:31]
1558;
1559; VI-LABEL: v_vselect_v8f16:
1560; VI:       ; %bb.0:
1561; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1562; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v3
1563; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
1564; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
1565; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v16, vcc
1566; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
1567; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
1568; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
1569; VI-NEXT:    v_cndmask_b32_e32 v13, v17, v16, vcc
1570; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
1571; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
1572; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
1573; VI-NEXT:    v_cndmask_b32_e32 v11, v17, v16, vcc
1574; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
1575; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
1576; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
1577; VI-NEXT:    v_cndmask_b32_e32 v9, v17, v16, vcc
1578; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
1579; VI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
1580; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
1581; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
1582; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
1583; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1584; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
1585; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
1586; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
1587; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1588; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
1589; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1590; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
1591; VI-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1592; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v15
1593; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1594; VI-NEXT:    s_setpc_b64 s[30:31]
1595;
1596; GFX11-LABEL: v_vselect_v8f16:
1597; GFX11:       ; %bb.0:
1598; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1599; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v3
1600; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
1601; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v15
1602; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
1603; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
1604; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v0
1605; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
1606; GFX11-NEXT:    v_cndmask_b32_e32 v15, v17, v16, vcc_lo
1607; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
1608; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
1609; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v13
1610; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1611; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v16, vcc_lo
1612; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
1613; GFX11-NEXT:    v_cndmask_b32_e32 v11, v19, v18, vcc_lo
1614; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
1615; GFX11-NEXT:    v_cndmask_b32_e32 v9, v21, v20, vcc_lo
1616; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
1617; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
1618; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
1619; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
1620; GFX11-NEXT:    v_perm_b32 v2, v13, v2, 0x5040100
1621; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
1622; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v10
1623; GFX11-NEXT:    v_perm_b32 v0, v9, v0, 0x5040100
1624; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
1625; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v14
1626; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1627; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
1628; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
1629; GFX11-NEXT:    v_perm_b32 v3, v15, v3, 0x5040100
1630; GFX11-NEXT:    s_setpc_b64 s[30:31]
1631  %cmp = icmp eq <8 x i32> %cond, zeroinitializer
1632  %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
1633  ret <8 x half> %select
1634}
1635
; Whole-vector select of <16 x half> on a scalar condition: returns %a when
; %cond == 0, otherwise %b (scalar icmp eq feeding a select with an i1 condition).
; Every run line is expected to emit a single v_cmp_eq_u32 whose result drives
; all of the selects. In the tahiti output the f32-held halves are first
; converted and packed in pairs (v_cvt_f16_f32, v_lshlrev_b32, v_or_b32), two
; dwords are loaded from the stack (the one at s32 offset 4 is the value compared
; with zero), eight dword v_cndmask_b32 selects are done and the result is
; unpacked back to f32. In the fiji output the selects are eight v_cndmask_b32
; on v0-v7; in the gfx1100 output they are four v_dual_cndmask_b32.
; The check lines are autogenerated (see the header of this file), so regenerate
; them with the update script instead of editing them by hand.
1636define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) {
1637; SI-LABEL: v_select_v16f16:
1638; SI:       ; %bb.0:
1639; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1640; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
1641; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
1642; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1643; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1644; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
1645; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
1646; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
1647; SI-NEXT:    v_or_b32_e32 v12, v12, v13
1648; SI-NEXT:    v_cvt_f16_f32_e32 v13, v29
1649; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1650; SI-NEXT:    v_or_b32_e32 v14, v14, v15
1651; SI-NEXT:    v_cvt_f16_f32_e32 v15, v28
1652; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
1653; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
1654; SI-NEXT:    v_or_b32_e32 v10, v10, v11
1655; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32
1656; SI-NEXT:    v_or_b32_e32 v13, v15, v13
1657; SI-NEXT:    v_cvt_f16_f32_e32 v15, v27
1658; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
1659; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1660; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
1661; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
1662; SI-NEXT:    v_or_b32_e32 v15, v26, v15
1663; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:4
1664; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1665; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1666; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1667; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1668; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1669; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1670; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1671; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
1672; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1673; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1674; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1675; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1676; SI-NEXT:    v_or_b32_e32 v8, v8, v9
1677; SI-NEXT:    v_cvt_f16_f32_e32 v9, v25
1678; SI-NEXT:    v_or_b32_e32 v6, v6, v7
1679; SI-NEXT:    v_cvt_f16_f32_e32 v7, v23
1680; SI-NEXT:    v_or_b32_e32 v4, v4, v5
1681; SI-NEXT:    v_cvt_f16_f32_e32 v5, v21
1682; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
1683; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1684; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
1685; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
1686; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
1687; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
1688; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
1689; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
1690; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1691; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
1692; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
1693; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1694; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1695; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v17
1696; SI-NEXT:    v_or_b32_e32 v9, v24, v9
1697; SI-NEXT:    v_or_b32_e32 v7, v22, v7
1698; SI-NEXT:    v_or_b32_e32 v5, v20, v5
1699; SI-NEXT:    v_or_b32_e32 v1, v16, v1
1700; SI-NEXT:    s_waitcnt vmcnt(1)
1701; SI-NEXT:    v_cvt_f16_f32_e32 v3, v11
1702; SI-NEXT:    v_cvt_f16_f32_e32 v11, v30
1703; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1704; SI-NEXT:    v_or_b32_e32 v3, v11, v3
1705; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v19
1706; SI-NEXT:    v_or_b32_e32 v11, v18, v11
1707; SI-NEXT:    s_waitcnt vmcnt(0)
1708; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
1709; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
1710; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v2, vcc
1711; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
1712; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
1713; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
1714; SI-NEXT:    v_cndmask_b32_e32 v15, v15, v10, vcc
1715; SI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
1716; SI-NEXT:    v_cndmask_b32_e32 v16, v3, v14, vcc
1717; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1718; SI-NEXT:    v_cvt_f32_f16_e32 v2, v11
1719; SI-NEXT:    v_cvt_f32_f16_e32 v4, v5
1720; SI-NEXT:    v_cvt_f32_f16_e32 v6, v7
1721; SI-NEXT:    v_cvt_f32_f16_e32 v8, v9
1722; SI-NEXT:    v_cvt_f32_f16_e32 v10, v15
1723; SI-NEXT:    v_cvt_f32_f16_e32 v12, v13
1724; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1725; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
1726; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1727; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1728; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
1729; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
1730; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
1731; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v16
1732; SI-NEXT:    v_cvt_f32_f16_e32 v14, v16
1733; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1734; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1735; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1736; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1737; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
1738; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
1739; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
1740; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
1741; SI-NEXT:    s_setpc_b64 s[30:31]
1742;
1743; VI-LABEL: v_select_v16f16:
1744; VI:       ; %bb.0:
1745; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1746; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
1747; VI-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
1748; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
1749; VI-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
1750; VI-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
1751; VI-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
1752; VI-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
1753; VI-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
1754; VI-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
1755; VI-NEXT:    s_setpc_b64 s[30:31]
1756;
1757; GFX11-LABEL: v_select_v16f16:
1758; GFX11:       ; %bb.0:
1759; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1760; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v16
1761; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
1762; GFX11-NEXT:    v_dual_cndmask_b32 v2, v10, v2 :: v_dual_cndmask_b32 v3, v11, v3
1763; GFX11-NEXT:    v_dual_cndmask_b32 v4, v12, v4 :: v_dual_cndmask_b32 v5, v13, v5
1764; GFX11-NEXT:    v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7
1765; GFX11-NEXT:    s_setpc_b64 s[30:31]
1766  %cmp = icmp eq i32 %cond, 0
1767  %select = select i1 %cmp, <16 x half> %a, <16 x half> %b
1768  ret <16 x half> %select
1769}
1770
; Per-lane select of <16 x half>: result lane i is %a[i] when %cond[i] == 0,
; otherwise %b[i] (vector icmp eq against zeroinitializer feeding a vector select).
; In the tahiti output seventeen dwords are loaded from the stack at s32 offsets
; 0 through 64; the one at offset 0 is converted like the other half operands and
; the remaining sixteen are each compared with zero, one v_cndmask_b32 per lane.
; In the fiji output the compare results are kept in SGPR pairs up to s[30:31];
; s30/s31 are saved into lanes of v31 with v_writelane_b32 and restored with
; v_readlane_b32, and v31 itself is spilled to and reloaded from s32 offset 4
; under s_xor_saveexec_b64. One dword is loaded from s32 and compared with zero.
; In the gfx1100 output one dword is loaded with scratch_load_b32 from s32 and
; the selected halves are repacked with v_perm_b32.
; The check lines are autogenerated (see the header of this file), so regenerate
; them with the update script instead of editing them by hand.
1771define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> %cond) {
1772; SI-LABEL: v_vselect_v16f16:
1773; SI:       ; %bb.0:
1774; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1775; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:4
1776; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
1777; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1778; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1779; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
1780; SI-NEXT:    v_cvt_f32_f16_e32 v37, v16
1781; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:8
1782; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
1783; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
1784; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
1785; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
1786; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
1787; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:32
1788; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1789; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1790; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
1791; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1792; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1793; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1794; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1795; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1796; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1797; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1798; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1799; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
1800; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1801; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
1802; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1803; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
1804; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1805; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1806; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
1807; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
1808; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
1809; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
1810; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
1811; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
1812; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
1813; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
1814; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
1815; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
1816; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
1817; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
1818; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
1819; SI-NEXT:    s_waitcnt vmcnt(7)
1820; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v36
1821; SI-NEXT:    v_cndmask_b32_e32 v0, v37, v0, vcc
1822; SI-NEXT:    s_waitcnt vmcnt(6)
1823; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v38
1824; SI-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
1825; SI-NEXT:    v_cvt_f16_f32_e32 v17, v18
1826; SI-NEXT:    s_waitcnt vmcnt(5)
1827; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v35
1828; SI-NEXT:    v_cvt_f16_f32_e32 v18, v20
1829; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
1830; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
1831; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
1832; SI-NEXT:    v_cndmask_b32_e32 v2, v17, v2, vcc
1833; SI-NEXT:    v_cvt_f16_f32_e32 v17, v19
1834; SI-NEXT:    s_waitcnt vmcnt(5)
1835; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
1836; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
1837; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:44
1838; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
1839; SI-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
1840; SI-NEXT:    s_waitcnt vmcnt(6)
1841; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v33
1842; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
1843; SI-NEXT:    v_cvt_f16_f32_e32 v17, v21
1844; SI-NEXT:    v_cndmask_b32_e32 v4, v18, v4, vcc
1845; SI-NEXT:    v_cvt_f16_f32_e32 v18, v22
1846; SI-NEXT:    s_waitcnt vmcnt(6)
1847; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v34
1848; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
1849; SI-NEXT:    v_cvt_f16_f32_e32 v22, v23
1850; SI-NEXT:    v_cvt_f32_f16_e32 v21, v18
1851; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:56
1852; SI-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
1853; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:52
1854; SI-NEXT:    s_waitcnt vmcnt(7)
1855; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
1856; SI-NEXT:    v_cndmask_b32_e32 v6, v21, v6, vcc
1857; SI-NEXT:    s_waitcnt vmcnt(6)
1858; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
1859; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
1860; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32
1861; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
1862; SI-NEXT:    v_cvt_f16_f32_e32 v23, v24
1863; SI-NEXT:    v_cvt_f16_f32_e32 v24, v25
1864; SI-NEXT:    v_cndmask_b32_e32 v7, v22, v7, vcc
1865; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64
1866; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
1867; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
1868; SI-NEXT:    s_waitcnt vmcnt(7)
1869; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
1870; SI-NEXT:    v_cndmask_b32_e32 v8, v23, v8, vcc
1871; SI-NEXT:    v_cvt_f16_f32_e32 v23, v26
1872; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
1873; SI-NEXT:    s_waitcnt vmcnt(5)
1874; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v33
1875; SI-NEXT:    v_cndmask_b32_e32 v9, v24, v9, vcc
1876; SI-NEXT:    v_cvt_f16_f32_e32 v24, v27
1877; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
1878; SI-NEXT:    v_cvt_f16_f32_e32 v19, v28
1879; SI-NEXT:    v_cndmask_b32_e32 v10, v23, v10, vcc
1880; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
1881; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v20
1882; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
1883; SI-NEXT:    v_cvt_f16_f32_e32 v20, v29
1884; SI-NEXT:    v_cndmask_b32_e32 v11, v24, v11, vcc
1885; SI-NEXT:    s_waitcnt vmcnt(3)
1886; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
1887; SI-NEXT:    v_cvt_f16_f32_e32 v17, v30
1888; SI-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc
1889; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
1890; SI-NEXT:    s_waitcnt vmcnt(1)
1891; SI-NEXT:    v_cvt_f16_f32_e32 v18, v21
1892; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
1893; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
1894; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
1895; SI-NEXT:    v_cndmask_b32_e32 v13, v20, v13, vcc
1896; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
1897; SI-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
1898; SI-NEXT:    s_waitcnt vmcnt(0)
1899; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
1900; SI-NEXT:    v_cndmask_b32_e32 v15, v18, v15, vcc
1901; SI-NEXT:    s_setpc_b64 s[30:31]
1902;
1903; VI-LABEL: v_vselect_v16f16:
1904; VI:       ; %bb.0:
1905; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1906; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
1907; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
1908; VI-NEXT:    s_mov_b64 exec, s[4:5]
1909; VI-NEXT:    v_writelane_b32 v31, s30, 0
1910; VI-NEXT:    v_writelane_b32 v31, s31, 1
1911; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
1912; VI-NEXT:    v_cmp_eq_u32_e64 s[18:19], 0, v17
1913; VI-NEXT:    v_cmp_eq_u32_e64 s[30:31], 0, v29
1914; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
1915; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
1916; VI-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v18
1917; VI-NEXT:    v_cmp_eq_u32_e64 s[28:29], 0, v27
1918; VI-NEXT:    v_cndmask_b32_e64 v16, v17, v16, s[30:31]
1919; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
1920; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
1921; VI-NEXT:    v_cmp_eq_u32_e64 s[20:21], 0, v19
1922; VI-NEXT:    v_cmp_eq_u32_e64 s[26:27], 0, v25
1923; VI-NEXT:    v_cndmask_b32_e64 v17, v18, v17, s[28:29]
1924; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v4
1925; VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
1926; VI-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v20
1927; VI-NEXT:    v_cmp_eq_u32_e64 s[24:25], 0, v23
1928; VI-NEXT:    v_cndmask_b32_e64 v18, v19, v18, s[26:27]
1929; VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
1930; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
1931; VI-NEXT:    v_cmp_eq_u32_e64 s[22:23], 0, v21
1932; VI-NEXT:    v_cndmask_b32_e64 v19, v20, v19, s[24:25]
1933; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
1934; VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
1935; VI-NEXT:    v_cmp_eq_u32_e64 s[10:11], 0, v22
1936; VI-NEXT:    v_cndmask_b32_e64 v20, v21, v20, s[22:23]
1937; VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v1
1938; VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
1939; VI-NEXT:    v_cndmask_b32_e64 v21, v22, v21, s[20:21]
1940; VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v0
1941; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
1942; VI-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
1943; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
1944; VI-NEXT:    v_cndmask_b32_e64 v22, v23, v22, s[18:19]
1945; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
1946; VI-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
1947; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v22
1948; VI-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[8:9]
1949; VI-NEXT:    v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1950; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
1951; VI-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
1952; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v15
1953; VI-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v24
1954; VI-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[10:11]
1955; VI-NEXT:    v_cmp_eq_u32_e64 s[14:15], 0, v26
1956; VI-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
1957; VI-NEXT:    v_cmp_eq_u32_e64 s[16:17], 0, v28
1958; VI-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[14:15]
1959; VI-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[16:17]
1960; VI-NEXT:    v_readlane_b32 s31, v31, 1
1961; VI-NEXT:    v_readlane_b32 s30, v31, 0
1962; VI-NEXT:    s_waitcnt vmcnt(0)
1963; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
1964; VI-NEXT:    v_cndmask_b32_e32 v8, v10, v9, vcc
1965; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v21
1966; VI-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1967; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v20
1968; VI-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1969; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v19
1970; VI-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1971; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v18
1972; VI-NEXT:    v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1973; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v17
1974; VI-NEXT:    v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1975; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v16
1976; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
1977; VI-NEXT:    v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1978; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1979; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
1980; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1981; VI-NEXT:    s_mov_b64 exec, s[4:5]
1982; VI-NEXT:    s_waitcnt vmcnt(0)
1983; VI-NEXT:    s_setpc_b64 s[30:31]
1984;
1985; GFX11-LABEL: v_vselect_v16f16:
1986; GFX11:       ; %bb.0:
1987; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1988; GFX11-NEXT:    scratch_load_b32 v31, off, s32
1989; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v30
1990; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
1991; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
1992; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v5
1993; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v4
1994; GFX11-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
1995; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v28
1996; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v3
1997; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v2
1998; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v1
1999; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
2000; GFX11-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
2001; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
2002; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v0
2003; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v13
2004; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
2005; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
2006; GFX11-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
2007; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v24
2008; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v9
2009; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
2010; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
2011; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
2012; GFX11-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
2013; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v22
2014; GFX11-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
2015; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v20
2016; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
2017; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
2018; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
2019; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v16
2020; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
2021; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v29
2022; GFX11-NEXT:    v_cndmask_b32_e32 v8, v35, v34, vcc_lo
2023; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v27
2024; GFX11-NEXT:    v_cndmask_b32_e32 v9, v37, v36, vcc_lo
2025; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
2026; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2027; GFX11-NEXT:    v_perm_b32 v6, v8, v6, 0x5040100
2028; GFX11-NEXT:    v_perm_b32 v5, v9, v5, 0x5040100
2029; GFX11-NEXT:    v_cndmask_b32_e32 v10, v39, v38, vcc_lo
2030; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
2031; GFX11-NEXT:    v_cndmask_b32_e32 v11, v53, v52, vcc_lo
2032; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v17
2033; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2034; GFX11-NEXT:    v_perm_b32 v4, v10, v4, 0x5040100
2035; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
2036; GFX11-NEXT:    v_cndmask_b32_e32 v12, v55, v54, vcc_lo
2037; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v21
2038; GFX11-NEXT:    v_cndmask_b32_e32 v13, v51, v50, vcc_lo
2039; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v23
2040; GFX11-NEXT:    v_cndmask_b32_e32 v14, v49, v48, vcc_lo
2041; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2042; GFX11-NEXT:    v_perm_b32 v2, v13, v2, 0x5040100
2043; GFX11-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
2044; GFX11-NEXT:    s_waitcnt vmcnt(0)
2045; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
2046; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v32, vcc_lo
2047; GFX11-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
2048; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2049; GFX11-NEXT:    v_perm_b32 v7, v11, v7, 0x5040100
2050; GFX11-NEXT:    s_setpc_b64 s[30:31]
2051  %cmp = icmp eq <16 x i32> %cond, zeroinitializer
2052  %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b
2053  ret <16 x half> %select
2054}
2055
2056define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
2057; SI-LABEL: v_select_v32f16:
2058; SI:       ; %bb.0:
2059; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2060; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
2061; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
2062; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
2063; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
2064; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
2065; SI-NEXT:    v_or_b32_e32 v20, v20, v21
2066; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32
2067; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2068; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2069; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2070; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2071; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2072; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
2073; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
2074; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2075; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2076; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2077; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2078; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2079; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2080; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
2081; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
2082; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
2083; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
2084; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2085; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2086; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2087; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2088; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
2089; SI-NEXT:    v_or_b32_e32 v12, v12, v13
2090; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56
2091; SI-NEXT:    v_or_b32_e32 v10, v10, v11
2092; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48
2093; SI-NEXT:    v_or_b32_e32 v8, v8, v9
2094; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:40
2095; SI-NEXT:    v_or_b32_e32 v6, v6, v7
2096; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
2097; SI-NEXT:    v_or_b32_e32 v4, v4, v5
2098; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24
2099; SI-NEXT:    v_or_b32_e32 v2, v2, v3
2100; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16
2101; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2102; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
2103; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
2104; SI-NEXT:    v_or_b32_e32 v22, v22, v23
2105; SI-NEXT:    v_cvt_f16_f32_e32 v23, v30
2106; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
2107; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
2108; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
2109; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
2110; SI-NEXT:    v_or_b32_e32 v24, v24, v25
2111; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
2112; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
2113; SI-NEXT:    v_or_b32_e32 v26, v26, v27
2114; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
2115; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
2116; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
2117; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
2118; SI-NEXT:    v_or_b32_e32 v28, v28, v29
2119; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
2120; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
2121; SI-NEXT:    v_or_b32_e32 v18, v18, v19
2122; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
2123; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
2124; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
2125; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
2126; SI-NEXT:    v_or_b32_e32 v16, v16, v17
2127; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:124
2128; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2129; SI-NEXT:    v_or_b32_e32 v14, v14, v15
2130; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:116
2131; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108
2132; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:100
2133; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
2134; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
2135; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
2136; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
2137; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64
2138; SI-NEXT:    s_waitcnt vmcnt(14)
2139; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
2140; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
2141; SI-NEXT:    v_or_b32_e32 v21, v23, v21
2142; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:128
2143; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
2144; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
2145; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
2146; SI-NEXT:    s_waitcnt vmcnt(14)
2147; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2148; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
2149; SI-NEXT:    s_waitcnt vmcnt(13)
2150; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2151; SI-NEXT:    s_waitcnt vmcnt(12)
2152; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2153; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
2154; SI-NEXT:    s_waitcnt vmcnt(11)
2155; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2156; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2157; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2158; SI-NEXT:    s_waitcnt vmcnt(10)
2159; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2160; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2161; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2162; SI-NEXT:    s_waitcnt vmcnt(9)
2163; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
2164; SI-NEXT:    s_waitcnt vmcnt(8)
2165; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
2166; SI-NEXT:    s_waitcnt vmcnt(7)
2167; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
2168; SI-NEXT:    s_waitcnt vmcnt(6)
2169; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
2170; SI-NEXT:    s_waitcnt vmcnt(5)
2171; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2172; SI-NEXT:    s_waitcnt vmcnt(4)
2173; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2174; SI-NEXT:    s_waitcnt vmcnt(0)
2175; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
2176; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
2177; SI-NEXT:    v_or_b32_e32 v23, v25, v23
2178; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
2179; SI-NEXT:    s_waitcnt vmcnt(0)
2180; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
2181; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
2182; SI-NEXT:    v_or_b32_e32 v25, v27, v25
2183; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
2184; SI-NEXT:    s_waitcnt vmcnt(0)
2185; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
2186; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
2187; SI-NEXT:    v_or_b32_e32 v27, v29, v27
2188; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:104
2189; SI-NEXT:    s_waitcnt vmcnt(0)
2190; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
2191; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
2192; SI-NEXT:    v_or_b32_e32 v29, v30, v29
2193; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:96
2194; SI-NEXT:    s_waitcnt vmcnt(0)
2195; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
2196; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
2197; SI-NEXT:    v_or_b32_e32 v30, v31, v30
2198; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
2199; SI-NEXT:    s_waitcnt vmcnt(0)
2200; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2201; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
2202; SI-NEXT:    v_or_b32_e32 v31, v32, v31
2203; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
2204; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
2205; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
2206; SI-NEXT:    s_waitcnt vmcnt(0)
2207; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2208; SI-NEXT:    v_or_b32_e32 v19, v32, v19
2209; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
2210; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
2211; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
2212; SI-NEXT:    s_waitcnt vmcnt(0)
2213; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2214; SI-NEXT:    v_or_b32_e32 v17, v32, v17
2215; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
2216; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
2217; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
2218; SI-NEXT:    s_waitcnt vmcnt(0)
2219; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2220; SI-NEXT:    v_or_b32_e32 v15, v32, v15
2221; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
2222; SI-NEXT:    s_waitcnt vmcnt(0)
2223; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2224; SI-NEXT:    v_or_b32_e32 v13, v32, v13
2225; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
2226; SI-NEXT:    s_waitcnt vmcnt(0)
2227; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2228; SI-NEXT:    v_or_b32_e32 v11, v32, v11
2229; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
2230; SI-NEXT:    s_waitcnt vmcnt(0)
2231; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2232; SI-NEXT:    v_or_b32_e32 v9, v32, v9
2233; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
2234; SI-NEXT:    s_waitcnt vmcnt(0)
2235; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2236; SI-NEXT:    v_or_b32_e32 v7, v32, v7
2237; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
2238; SI-NEXT:    s_waitcnt vmcnt(0)
2239; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2240; SI-NEXT:    v_or_b32_e32 v5, v32, v5
2241; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
2242; SI-NEXT:    s_waitcnt vmcnt(0)
2243; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2244; SI-NEXT:    v_or_b32_e32 v3, v32, v3
2245; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
2246; SI-NEXT:    s_waitcnt vmcnt(0)
2247; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2248; SI-NEXT:    v_or_b32_e32 v1, v32, v1
2249; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
2250; SI-NEXT:    s_waitcnt vmcnt(0)
2251; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
2252; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
2253; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
2254; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
2255; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
2256; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v8, vcc
2257; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
2258; SI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
2259; SI-NEXT:    v_cndmask_b32_e32 v15, v15, v14, vcc
2260; SI-NEXT:    v_cndmask_b32_e32 v17, v17, v16, vcc
2261; SI-NEXT:    v_cndmask_b32_e32 v19, v19, v18, vcc
2262; SI-NEXT:    v_cndmask_b32_e32 v31, v31, v20, vcc
2263; SI-NEXT:    v_cndmask_b32_e32 v30, v30, v22, vcc
2264; SI-NEXT:    v_cndmask_b32_e32 v29, v29, v24, vcc
2265; SI-NEXT:    v_cndmask_b32_e32 v27, v27, v26, vcc
2266; SI-NEXT:    v_cndmask_b32_e32 v32, v25, v28, vcc
2267; SI-NEXT:    v_cndmask_b32_e32 v33, v23, v21, vcc
2268; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
2269; SI-NEXT:    v_cvt_f32_f16_e32 v2, v3
2270; SI-NEXT:    v_cvt_f32_f16_e32 v4, v5
2271; SI-NEXT:    v_cvt_f32_f16_e32 v6, v7
2272; SI-NEXT:    v_cvt_f32_f16_e32 v8, v9
2273; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2274; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2275; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
2276; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
2277; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
2278; SI-NEXT:    v_cvt_f32_f16_e32 v10, v11
2279; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
2280; SI-NEXT:    v_cvt_f32_f16_e32 v12, v13
2281; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
2282; SI-NEXT:    v_cvt_f32_f16_e32 v14, v15
2283; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
2284; SI-NEXT:    v_cvt_f32_f16_e32 v16, v17
2285; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
2286; SI-NEXT:    v_cvt_f32_f16_e32 v18, v19
2287; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
2288; SI-NEXT:    v_cvt_f32_f16_e32 v20, v31
2289; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v31
2290; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v30
2291; SI-NEXT:    v_cvt_f32_f16_e32 v24, v29
2292; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v29
2293; SI-NEXT:    v_cvt_f32_f16_e32 v26, v27
2294; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
2295; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v32
2296; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v33
2297; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2298; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2299; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
2300; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2301; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
2302; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
2303; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
2304; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
2305; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
2306; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
2307; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
2308; SI-NEXT:    v_cvt_f32_f16_e32 v22, v30
2309; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
2310; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
2311; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
2312; SI-NEXT:    v_cvt_f32_f16_e32 v28, v32
2313; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
2314; SI-NEXT:    v_cvt_f32_f16_e32 v30, v33
2315; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2316; SI-NEXT:    s_setpc_b64 s[30:31]
2317;
2318; VI-LABEL: v_select_v32f16:
2319; VI:       ; %bb.0:
2320; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2321; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
2322; VI-NEXT:    s_waitcnt vmcnt(0)
2323; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2324; VI-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
2325; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32
2326; VI-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
2327; VI-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
2328; VI-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
2329; VI-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
2330; VI-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
2331; VI-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
2332; VI-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
2333; VI-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
2334; VI-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
2335; VI-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
2336; VI-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
2337; VI-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
2338; VI-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
2339; VI-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
2340; VI-NEXT:    s_waitcnt vmcnt(0)
2341; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
2342; VI-NEXT:    s_setpc_b64 s[30:31]
2343;
2344; GFX11-LABEL: v_select_v32f16:
2345; GFX11:       ; %bb.0:
2346; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2347; GFX11-NEXT:    s_clause 0x1
2348; GFX11-NEXT:    scratch_load_b32 v31, off, s32 offset:4
2349; GFX11-NEXT:    scratch_load_b32 v32, off, s32
2350; GFX11-NEXT:    s_waitcnt vmcnt(1)
2351; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
2352; GFX11-NEXT:    v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1
2353; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v3
2354; GFX11-NEXT:    v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v5, v21, v5
2355; GFX11-NEXT:    v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7
2356; GFX11-NEXT:    v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v9, v25, v9
2357; GFX11-NEXT:    v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11
2358; GFX11-NEXT:    v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v13, v29, v13
2359; GFX11-NEXT:    s_waitcnt vmcnt(0)
2360; GFX11-NEXT:    v_dual_cndmask_b32 v14, v30, v14 :: v_dual_cndmask_b32 v15, v32, v15
2361; GFX11-NEXT:    s_setpc_b64 s[30:31]
2362  %cmp = icmp eq i32 %cond, 0
2363  %select = select i1 %cmp, <32 x half> %a, <32 x half> %b
2364  ret <32 x half> %select
2365}
2366
2367define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> %cond) {
2368; SI-LABEL: v_vselect_v32f16:
2369; SI:       ; %bb.0:
2370; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2371; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
2372; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2373; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2374; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2375; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2376; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2377; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2378; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2379; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2380; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2381; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2382; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2383; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2384; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
2385; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
2386; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
2387; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2388; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2389; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2390; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2391; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
2392; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
2393; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
2394; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
2395; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
2396; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
2397; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
2398; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
2399; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
2400; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
2401; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
2402; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
2403; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
2404; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
2405; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
2406; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
2407; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
2408; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
2409; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
2410; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
2411; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
2412; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
2413; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
2414; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
2415; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
2416; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
2417; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
2418; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
2419; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
2420; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
2421; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
2422; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
2423; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
2424; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
2425; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
2426; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
2427; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
2428; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
2429; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
2430; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
2431; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
2432; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
2433; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
2434; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
2435; SI-NEXT:    s_waitcnt vmcnt(1)
2436; SI-NEXT:    v_cmp_eq_u32_e64 s[16:17], 0, v31
2437; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:136
2438; SI-NEXT:    s_waitcnt vmcnt(1)
2439; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
2440; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
2441; SI-NEXT:    s_waitcnt vmcnt(0)
2442; SI-NEXT:    v_cmp_eq_u32_e64 s[14:15], 0, v31
2443; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:140
2444; SI-NEXT:    s_waitcnt vmcnt(0)
2445; SI-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v31
2446; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:144
2447; SI-NEXT:    s_waitcnt vmcnt(0)
2448; SI-NEXT:    v_cmp_eq_u32_e64 s[10:11], 0, v31
2449; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
2450; SI-NEXT:    s_waitcnt vmcnt(0)
2451; SI-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v31
2452; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:152
2453; SI-NEXT:    s_waitcnt vmcnt(0)
2454; SI-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v31
2455; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:156
2456; SI-NEXT:    s_waitcnt vmcnt(0)
2457; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v31
2458; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:160
2459; SI-NEXT:    s_waitcnt vmcnt(0)
2460; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2461; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
2462; SI-NEXT:    s_waitcnt vmcnt(0)
2463; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2464; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2465; SI-NEXT:    v_cndmask_b32_e64 v0, v31, v0, s[16:17]
2466; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
2467; SI-NEXT:    s_waitcnt vmcnt(0)
2468; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2469; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2470; SI-NEXT:    v_cndmask_b32_e64 v1, v31, v1, s[14:15]
2471; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
2472; SI-NEXT:    s_waitcnt vmcnt(0)
2473; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2474; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2475; SI-NEXT:    v_cndmask_b32_e64 v2, v31, v2, s[12:13]
2476; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
2477; SI-NEXT:    s_waitcnt vmcnt(0)
2478; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2479; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2480; SI-NEXT:    v_cndmask_b32_e64 v3, v31, v3, s[10:11]
2481; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
2482; SI-NEXT:    s_waitcnt vmcnt(0)
2483; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2484; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2485; SI-NEXT:    v_cndmask_b32_e64 v4, v31, v4, s[8:9]
2486; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
2487; SI-NEXT:    s_waitcnt vmcnt(0)
2488; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2489; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2490; SI-NEXT:    v_cndmask_b32_e64 v5, v31, v5, s[6:7]
2491; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
2492; SI-NEXT:    s_waitcnt vmcnt(0)
2493; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2494; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2495; SI-NEXT:    v_cndmask_b32_e64 v6, v31, v6, s[4:5]
2496; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
2497; SI-NEXT:    s_waitcnt vmcnt(0)
2498; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2499; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2500; SI-NEXT:    v_cndmask_b32_e32 v7, v31, v7, vcc
2501; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:164
2502; SI-NEXT:    s_waitcnt vmcnt(0)
2503; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2504; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
2505; SI-NEXT:    s_waitcnt vmcnt(0)
2506; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2507; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2508; SI-NEXT:    v_cndmask_b32_e32 v8, v31, v8, vcc
2509; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:168
2510; SI-NEXT:    s_waitcnt vmcnt(0)
2511; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2512; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
2513; SI-NEXT:    s_waitcnt vmcnt(0)
2514; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2515; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2516; SI-NEXT:    v_cndmask_b32_e32 v9, v31, v9, vcc
2517; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:172
2518; SI-NEXT:    s_waitcnt vmcnt(0)
2519; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2520; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
2521; SI-NEXT:    s_waitcnt vmcnt(0)
2522; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2523; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2524; SI-NEXT:    v_cndmask_b32_e32 v10, v31, v10, vcc
2525; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:176
2526; SI-NEXT:    s_waitcnt vmcnt(0)
2527; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2528; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
2529; SI-NEXT:    s_waitcnt vmcnt(0)
2530; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2531; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2532; SI-NEXT:    v_cndmask_b32_e32 v11, v31, v11, vcc
2533; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
2534; SI-NEXT:    s_waitcnt vmcnt(0)
2535; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2536; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
2537; SI-NEXT:    s_waitcnt vmcnt(0)
2538; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2539; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2540; SI-NEXT:    v_cndmask_b32_e32 v12, v31, v12, vcc
2541; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:184
2542; SI-NEXT:    s_waitcnt vmcnt(0)
2543; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2544; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
2545; SI-NEXT:    s_waitcnt vmcnt(0)
2546; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2547; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2548; SI-NEXT:    v_cndmask_b32_e32 v13, v31, v13, vcc
2549; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:188
2550; SI-NEXT:    s_waitcnt vmcnt(0)
2551; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2552; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
2553; SI-NEXT:    s_waitcnt vmcnt(0)
2554; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2555; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2556; SI-NEXT:    v_cndmask_b32_e32 v14, v31, v14, vcc
2557; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:192
2558; SI-NEXT:    s_waitcnt vmcnt(0)
2559; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2560; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
2561; SI-NEXT:    s_waitcnt vmcnt(0)
2562; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2563; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2564; SI-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc
2565; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:196
2566; SI-NEXT:    s_waitcnt vmcnt(0)
2567; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2568; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
2569; SI-NEXT:    s_waitcnt vmcnt(0)
2570; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2571; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2572; SI-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
2573; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
2574; SI-NEXT:    s_waitcnt vmcnt(0)
2575; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2576; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
2577; SI-NEXT:    s_waitcnt vmcnt(0)
2578; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2579; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2580; SI-NEXT:    v_cndmask_b32_e32 v17, v31, v17, vcc
2581; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:204
2582; SI-NEXT:    s_waitcnt vmcnt(0)
2583; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2584; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
2585; SI-NEXT:    s_waitcnt vmcnt(0)
2586; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2587; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2588; SI-NEXT:    v_cndmask_b32_e32 v18, v31, v18, vcc
2589; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:208
2590; SI-NEXT:    s_waitcnt vmcnt(0)
2591; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2592; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
2593; SI-NEXT:    s_waitcnt vmcnt(0)
2594; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2595; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2596; SI-NEXT:    v_cndmask_b32_e32 v19, v31, v19, vcc
2597; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:212
2598; SI-NEXT:    s_waitcnt vmcnt(0)
2599; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2600; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
2601; SI-NEXT:    s_waitcnt vmcnt(0)
2602; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2603; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2604; SI-NEXT:    v_cndmask_b32_e32 v20, v31, v20, vcc
2605; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:216
2606; SI-NEXT:    s_waitcnt vmcnt(0)
2607; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2608; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
2609; SI-NEXT:    s_waitcnt vmcnt(0)
2610; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2611; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2612; SI-NEXT:    v_cndmask_b32_e32 v21, v31, v21, vcc
2613; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:220
2614; SI-NEXT:    s_waitcnt vmcnt(0)
2615; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2616; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
2617; SI-NEXT:    s_waitcnt vmcnt(0)
2618; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2619; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2620; SI-NEXT:    v_cndmask_b32_e32 v22, v31, v22, vcc
2621; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:224
2622; SI-NEXT:    s_waitcnt vmcnt(0)
2623; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2624; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
2625; SI-NEXT:    s_waitcnt vmcnt(0)
2626; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2627; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2628; SI-NEXT:    v_cndmask_b32_e32 v23, v31, v23, vcc
2629; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:228
2630; SI-NEXT:    s_waitcnt vmcnt(0)
2631; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2632; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
2633; SI-NEXT:    s_waitcnt vmcnt(0)
2634; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2635; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2636; SI-NEXT:    v_cndmask_b32_e32 v24, v31, v24, vcc
2637; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:232
2638; SI-NEXT:    s_waitcnt vmcnt(0)
2639; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2640; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
2641; SI-NEXT:    s_waitcnt vmcnt(0)
2642; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2643; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2644; SI-NEXT:    v_cndmask_b32_e32 v25, v31, v25, vcc
2645; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:236
2646; SI-NEXT:    s_waitcnt vmcnt(0)
2647; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2648; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
2649; SI-NEXT:    s_waitcnt vmcnt(0)
2650; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2651; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2652; SI-NEXT:    v_cndmask_b32_e32 v26, v31, v26, vcc
2653; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:240
2654; SI-NEXT:    s_waitcnt vmcnt(0)
2655; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2656; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
2657; SI-NEXT:    s_waitcnt vmcnt(0)
2658; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2659; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2660; SI-NEXT:    v_cndmask_b32_e32 v27, v31, v27, vcc
2661; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:244
2662; SI-NEXT:    s_waitcnt vmcnt(0)
2663; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2664; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
2665; SI-NEXT:    s_waitcnt vmcnt(0)
2666; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2667; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2668; SI-NEXT:    v_cndmask_b32_e32 v28, v31, v28, vcc
2669; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:248
2670; SI-NEXT:    s_waitcnt vmcnt(0)
2671; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2672; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
2673; SI-NEXT:    s_waitcnt vmcnt(0)
2674; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2675; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2676; SI-NEXT:    v_cndmask_b32_e32 v29, v31, v29, vcc
2677; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
2678; SI-NEXT:    s_waitcnt vmcnt(0)
2679; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2680; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
2681; SI-NEXT:    s_waitcnt vmcnt(0)
2682; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2683; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2684; SI-NEXT:    v_cndmask_b32_e32 v30, v31, v30, vcc
2685; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:256
2686; SI-NEXT:    s_waitcnt vmcnt(0)
2687; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2688; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
2689; SI-NEXT:    s_waitcnt vmcnt(0)
2690; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
2691; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
2692; SI-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
2693; SI-NEXT:    s_setpc_b64 s[30:31]
2694;
2695; VI-LABEL: v_vselect_v32f16:
2696; VI:       ; %bb.0:
2697; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2698; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
2699; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
2700; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
2701; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
2702; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
2703; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
2704; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
2705; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
2706; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
2707; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
2708; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
2709; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
2710; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
2711; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:120
2712; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:112
2713; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:104
2714; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
2715; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
2716; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
2717; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:72
2718; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
2719; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:128
2720; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:64
2721; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:56
2722; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:48
2723; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:40
2724; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:32
2725; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:24
2726; VI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:16
2727; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8
2728; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:124
2729; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:116
2730; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v14
2731; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v30
2732; VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v13
2733; VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v29
2734; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v12
2735; VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v28
2736; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v11
2737; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v27
2738; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v10
2739; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v26
2740; VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v9
2741; VI-NEXT:    s_waitcnt vmcnt(14)
2742; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v36
2743; VI-NEXT:    v_cndmask_b32_e32 v36, v43, v38, vcc
2744; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v35
2745; VI-NEXT:    v_cndmask_b32_e32 v35, v45, v44, vcc
2746; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v34
2747; VI-NEXT:    v_cndmask_b32_e32 v34, v47, v46, vcc
2748; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v33
2749; VI-NEXT:    v_cndmask_b32_e32 v33, v57, v56, vcc
2750; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
2751; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
2752; VI-NEXT:    v_cndmask_b32_e32 v32, v59, v58, vcc
2753; VI-NEXT:    s_waitcnt vmcnt(13)
2754; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v31
2755; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v8
2756; VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v24
2757; VI-NEXT:    v_cndmask_b32_e32 v38, v38, v60, vcc
2758; VI-NEXT:    s_waitcnt vmcnt(12)
2759; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v39
2760; VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v15
2761; VI-NEXT:    v_cndmask_b32_e32 v39, v44, v43, vcc
2762; VI-NEXT:    s_waitcnt vmcnt(11)
2763; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v37
2764; VI-NEXT:    s_waitcnt vmcnt(10)
2765; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v55
2766; VI-NEXT:    v_cndmask_b32_e32 v31, v31, v45, vcc
2767; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v7
2768; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v23
2769; VI-NEXT:    s_waitcnt vmcnt(9)
2770; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v50
2771; VI-NEXT:    v_cndmask_b32_e32 v50, v43, v55, vcc
2772; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
2773; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v22
2774; VI-NEXT:    s_waitcnt vmcnt(8)
2775; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v53
2776; VI-NEXT:    v_cndmask_b32_e32 v53, v43, v55, vcc
2777; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v5
2778; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v21
2779; VI-NEXT:    s_waitcnt vmcnt(7)
2780; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v54
2781; VI-NEXT:    v_cndmask_b32_e32 v54, v43, v55, vcc
2782; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v4
2783; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v20
2784; VI-NEXT:    s_waitcnt vmcnt(6)
2785; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v52
2786; VI-NEXT:    v_cndmask_b32_e32 v52, v43, v55, vcc
2787; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v3
2788; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
2789; VI-NEXT:    s_waitcnt vmcnt(5)
2790; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v51
2791; VI-NEXT:    v_cndmask_b32_e32 v51, v43, v55, vcc
2792; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
2793; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
2794; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:108
2795; VI-NEXT:    s_waitcnt vmcnt(5)
2796; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v49
2797; VI-NEXT:    v_cndmask_b32_e32 v49, v43, v55, vcc
2798; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:100
2799; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:92
2800; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v1
2801; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v17
2802; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:84
2803; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:76
2804; VI-NEXT:    s_waitcnt vmcnt(8)
2805; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v48
2806; VI-NEXT:    v_cndmask_b32_e32 v48, v46, v43, vcc
2807; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
2808; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:60
2809; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
2810; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v0
2811; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v16
2812; VI-NEXT:    s_waitcnt vmcnt(10)
2813; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v40
2814; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44
2815; VI-NEXT:    v_cndmask_b32_e32 v46, v58, v46, vcc
2816; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36
2817; VI-NEXT:    s_waitcnt vmcnt(11)
2818; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v41
2819; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:28
2820; VI-NEXT:    v_cndmask_b32_e32 v15, v37, v15, vcc
2821; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
2822; VI-NEXT:    s_waitcnt vmcnt(12)
2823; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v42
2824; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:12
2825; VI-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
2826; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4
2827; VI-NEXT:    s_waitcnt vmcnt(13)
2828; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v44
2829; VI-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
2830; VI-NEXT:    s_waitcnt vmcnt(12)
2831; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v55
2832; VI-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
2833; VI-NEXT:    s_waitcnt vmcnt(11)
2834; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v45
2835; VI-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
2836; VI-NEXT:    s_waitcnt vmcnt(10)
2837; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v47
2838; VI-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
2839; VI-NEXT:    s_waitcnt vmcnt(9)
2840; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v56
2841; VI-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
2842; VI-NEXT:    s_waitcnt vmcnt(8)
2843; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v43
2844; VI-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
2845; VI-NEXT:    s_waitcnt vmcnt(7)
2846; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v57
2847; VI-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
2848; VI-NEXT:    s_waitcnt vmcnt(6)
2849; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v59
2850; VI-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
2851; VI-NEXT:    s_waitcnt vmcnt(5)
2852; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v40
2853; VI-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
2854; VI-NEXT:    s_waitcnt vmcnt(4)
2855; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v58
2856; VI-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
2857; VI-NEXT:    s_waitcnt vmcnt(3)
2858; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v41
2859; VI-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
2860; VI-NEXT:    s_waitcnt vmcnt(2)
2861; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v37
2862; VI-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
2863; VI-NEXT:    s_waitcnt vmcnt(1)
2864; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v42
2865; VI-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
2866; VI-NEXT:    s_waitcnt vmcnt(0)
2867; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
2868; VI-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
2869; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v46
2870; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
2871; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
2872; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
2873; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
2874; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
2875; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
2876; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
2877; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
2878; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
2879; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
2880; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
2881; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
2882; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
2883; VI-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2884; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
2885; VI-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2886; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v49
2887; VI-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2888; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v51
2889; VI-NEXT:    v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2890; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v52
2891; VI-NEXT:    v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2892; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
2893; VI-NEXT:    v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2894; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
2895; VI-NEXT:    v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2896; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v50
2897; VI-NEXT:    v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2898; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v39
2899; VI-NEXT:    v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2900; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v38
2901; VI-NEXT:    v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2902; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v32
2903; VI-NEXT:    v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2904; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v33
2905; VI-NEXT:    v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2906; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
2907; VI-NEXT:    v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2908; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v35
2909; VI-NEXT:    v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2910; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v36
2911; VI-NEXT:    v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2912; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
2913; VI-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2914; VI-NEXT:    s_waitcnt vmcnt(0)
2915; VI-NEXT:    s_setpc_b64 s[30:31]
2916;
2917; GFX11-LABEL: v_vselect_v32f16:
2918; GFX11:       ; %bb.0:
2919; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2920; GFX11-NEXT:    s_clause 0x1f
2921; GFX11-NEXT:    scratch_load_b32 v31, off, s32 offset:120
2922; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:112
2923; GFX11-NEXT:    scratch_load_b32 v33, off, s32
2924; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:104
2925; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:96
2926; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:88
2927; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:80
2928; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:72
2929; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:64
2930; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:56
2931; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:48
2932; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:40
2933; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
2934; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:24
2935; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:16
2936; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:8
2937; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:124
2938; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:116
2939; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:108
2940; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:100
2941; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:92
2942; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:84
2943; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:76
2944; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:68
2945; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:60
2946; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:52
2947; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:44
2948; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:36
2949; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:28
2950; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:12
2951; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:4
2952; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:20
2953; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:128
2954; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v14
2955; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 16, v30
2956; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 16, v13
2957; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v29
2958; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 16, v12
2959; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v28
2960; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 16, v11
2961; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 16, v27
2962; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v10
2963; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 16, v26
2964; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v9
2965; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 16, v25
2966; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 16, v8
2967; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v24
2968; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 16, v7
2969; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v23
2970; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 16, v6
2971; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 16, v22
2972; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v5
2973; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 16, v21
2974; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v4
2975; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 16, v20
2976; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 16, v3
2977; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v19
2978; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v2
2979; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v18
2980; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 16, v1
2981; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v15
2982; GFX11-NEXT:    s_waitcnt vmcnt(32)
2983; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
2984; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
2985; GFX11-NEXT:    v_cndmask_b32_e32 v97, v98, v97, vcc_lo
2986; GFX11-NEXT:    s_waitcnt vmcnt(31)
2987; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v32
2988; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 16, v0
2989; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
2990; GFX11-NEXT:    v_cndmask_b32_e32 v99, v100, v99, vcc_lo
2991; GFX11-NEXT:    s_waitcnt vmcnt(29)
2992; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v34
2993; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v33
2994; GFX11-NEXT:    v_cndmask_b32_e32 v34, v102, v101, vcc_lo
2995; GFX11-NEXT:    s_waitcnt vmcnt(28)
2996; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v35
2997; GFX11-NEXT:    v_cndmask_b32_e32 v35, v112, v103, vcc_lo
2998; GFX11-NEXT:    s_waitcnt vmcnt(27)
2999; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v36
3000; GFX11-NEXT:    v_cndmask_b32_e32 v36, v114, v113, vcc_lo
3001; GFX11-NEXT:    s_waitcnt vmcnt(26)
3002; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v37
3003; GFX11-NEXT:    v_cndmask_b32_e32 v37, v116, v115, vcc_lo
3004; GFX11-NEXT:    s_waitcnt vmcnt(25)
3005; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v38
3006; GFX11-NEXT:    v_cndmask_b32_e32 v38, v118, v117, vcc_lo
3007; GFX11-NEXT:    s_waitcnt vmcnt(24)
3008; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v39
3009; GFX11-NEXT:    v_cndmask_b32_e32 v39, v128, v119, vcc_lo
3010; GFX11-NEXT:    s_waitcnt vmcnt(23)
3011; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v48
3012; GFX11-NEXT:    v_cndmask_b32_e32 v48, v130, v129, vcc_lo
3013; GFX11-NEXT:    s_waitcnt vmcnt(22)
3014; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v49
3015; GFX11-NEXT:    v_cndmask_b32_e32 v49, v132, v131, vcc_lo
3016; GFX11-NEXT:    s_waitcnt vmcnt(21)
3017; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v50
3018; GFX11-NEXT:    v_cndmask_b32_e32 v50, v134, v133, vcc_lo
3019; GFX11-NEXT:    s_waitcnt vmcnt(20)
3020; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v51
3021; GFX11-NEXT:    v_cndmask_b32_e32 v51, v144, v135, vcc_lo
3022; GFX11-NEXT:    s_waitcnt vmcnt(19)
3023; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v52
3024; GFX11-NEXT:    v_cndmask_b32_e32 v52, v146, v145, vcc_lo
3025; GFX11-NEXT:    s_waitcnt vmcnt(18)
3026; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v53
3027; GFX11-NEXT:    v_cndmask_b32_e32 v31, v31, v147, vcc_lo
3028; GFX11-NEXT:    s_waitcnt vmcnt(17)
3029; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v54
3030; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v98, vcc_lo
3031; GFX11-NEXT:    s_waitcnt vmcnt(16)
3032; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v55
3033; GFX11-NEXT:    v_cndmask_b32_e32 v15, v33, v15, vcc_lo
3034; GFX11-NEXT:    s_waitcnt vmcnt(15)
3035; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v64
3036; GFX11-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
3037; GFX11-NEXT:    s_waitcnt vmcnt(14)
3038; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v65
3039; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3040; GFX11-NEXT:    v_perm_b32 v14, v97, v14, 0x5040100
3041; GFX11-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
3042; GFX11-NEXT:    s_waitcnt vmcnt(13)
3043; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v66
3044; GFX11-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
3045; GFX11-NEXT:    s_waitcnt vmcnt(12)
3046; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v67
3047; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3048; GFX11-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
3049; GFX11-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
3050; GFX11-NEXT:    s_waitcnt vmcnt(11)
3051; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v68
3052; GFX11-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
3053; GFX11-NEXT:    s_waitcnt vmcnt(10)
3054; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v69
3055; GFX11-NEXT:    v_perm_b32 v13, v99, v13, 0x5040100
3056; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3057; GFX11-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
3058; GFX11-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
3059; GFX11-NEXT:    s_waitcnt vmcnt(9)
3060; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v70
3061; GFX11-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
3062; GFX11-NEXT:    s_waitcnt vmcnt(8)
3063; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v71
3064; GFX11-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
3065; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3066; GFX11-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
3067; GFX11-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
3068; GFX11-NEXT:    s_waitcnt vmcnt(7)
3069; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v80
3070; GFX11-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
3071; GFX11-NEXT:    s_waitcnt vmcnt(6)
3072; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v81
3073; GFX11-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
3074; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3075; GFX11-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
3076; GFX11-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
3077; GFX11-NEXT:    s_waitcnt vmcnt(5)
3078; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v82
3079; GFX11-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
3080; GFX11-NEXT:    s_waitcnt vmcnt(4)
3081; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v83
3082; GFX11-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
3083; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3084; GFX11-NEXT:    v_perm_b32 v4, v50, v4, 0x5040100
3085; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
3086; GFX11-NEXT:    s_waitcnt vmcnt(3)
3087; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v84
3088; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
3089; GFX11-NEXT:    s_waitcnt vmcnt(2)
3090; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v85
3091; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
3092; GFX11-NEXT:    s_waitcnt vmcnt(1)
3093; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v86
3094; GFX11-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
3095; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
3096; GFX11-NEXT:    v_perm_b32 v0, v32, v0, 0x5040100
3097; GFX11-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
3098; GFX11-NEXT:    s_waitcnt vmcnt(0)
3099; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v87
3100; GFX11-NEXT:    v_perm_b32 v3, v51, v3, 0x5040100
3101; GFX11-NEXT:    v_perm_b32 v2, v52, v2, 0x5040100
3102; GFX11-NEXT:    v_cndmask_b32_e32 v16, v100, v96, vcc_lo
3103; GFX11-NEXT:    v_perm_b32 v1, v31, v1, 0x5040100
3104; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3105; GFX11-NEXT:    v_perm_b32 v15, v16, v15, 0x5040100
3106; GFX11-NEXT:    s_setpc_b64 s[30:31]
3107  %cmp = icmp eq <32 x i32> %cond, zeroinitializer
3108  %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b
3109  ret <32 x half> %select
3110}
3111