xref: /llvm-project/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll (revision 26e13091ea5ac3a53d11b50265a506f88129d6ff)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
6
; Scalar i16 source: loads an i16 from %a, converts it to half with uitofp,
; and stores the result to %r.
; Expected code per run line: the tahiti checks widen through f32
; (v_cvt_f32_u32 followed by v_cvt_f16_f32); the fiji and both gfx1100 checks
; use a single v_cvt_f16_u16. The +real-true16 checks name 16-bit register
; halves (v0.l) where the -real-true16 checks name the full register (v0).
7define amdgpu_kernel void @uitofp_i16_to_f16(
8; SI-LABEL: uitofp_i16_to_f16:
9; SI:       ; %bb.0: ; %entry
10; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
11; SI-NEXT:    s_mov_b32 s7, 0xf000
12; SI-NEXT:    s_mov_b32 s6, -1
13; SI-NEXT:    s_mov_b32 s10, s6
14; SI-NEXT:    s_mov_b32 s11, s7
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b32 s8, s2
17; SI-NEXT:    s_mov_b32 s9, s3
18; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
19; SI-NEXT:    s_mov_b32 s4, s0
20; SI-NEXT:    s_mov_b32 s5, s1
21; SI-NEXT:    s_waitcnt vmcnt(0)
22; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
23; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
24; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
25; SI-NEXT:    s_endpgm
26;
27; VI-LABEL: uitofp_i16_to_f16:
28; VI:       ; %bb.0: ; %entry
29; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
30; VI-NEXT:    s_mov_b32 s7, 0xf000
31; VI-NEXT:    s_mov_b32 s6, -1
32; VI-NEXT:    s_mov_b32 s10, s6
33; VI-NEXT:    s_mov_b32 s11, s7
34; VI-NEXT:    s_waitcnt lgkmcnt(0)
35; VI-NEXT:    s_mov_b32 s8, s2
36; VI-NEXT:    s_mov_b32 s9, s3
37; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
38; VI-NEXT:    s_mov_b32 s4, s0
39; VI-NEXT:    s_mov_b32 s5, s1
40; VI-NEXT:    s_waitcnt vmcnt(0)
41; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
42; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
43; VI-NEXT:    s_endpgm
44;
45; GFX11-TRUE16-LABEL: uitofp_i16_to_f16:
46; GFX11-TRUE16:       ; %bb.0: ; %entry
47; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
48; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
49; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
50; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
51; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
52; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
54; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
55; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
56; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
57; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
58; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
59; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
60; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
61; GFX11-TRUE16-NEXT:    s_endpgm
62;
63; GFX11-FAKE16-LABEL: uitofp_i16_to_f16:
64; GFX11-FAKE16:       ; %bb.0: ; %entry
65; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
66; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
67; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
68; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
69; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
70; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
72; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
73; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
74; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
75; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
76; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
77; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
78; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
79; GFX11-FAKE16-NEXT:    s_endpgm
80    ptr addrspace(1) %r,
81    ptr addrspace(1) %a) {
82entry:
83  %a.val = load i16, ptr addrspace(1) %a
  ; The conversion under test: unsigned i16 -> half.
84  %r.val = uitofp i16 %a.val to half
85  store half %r.val, ptr addrspace(1) %r
86  ret void
87}
88
; Scalar i32 source: loads an i32 from %a, converts it to half with uitofp,
; and stores the result to %r.
; Expected code: every run line converts in two steps, v_cvt_f32_u32 followed
; by v_cvt_f16_f32. The gfx1100 checks additionally expect an s_delay_alu
; between the two conversions.
89define amdgpu_kernel void @uitofp_i32_to_f16(
90; SI-LABEL: uitofp_i32_to_f16:
91; SI:       ; %bb.0: ; %entry
92; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
93; SI-NEXT:    s_mov_b32 s7, 0xf000
94; SI-NEXT:    s_mov_b32 s6, -1
95; SI-NEXT:    s_mov_b32 s10, s6
96; SI-NEXT:    s_mov_b32 s11, s7
97; SI-NEXT:    s_waitcnt lgkmcnt(0)
98; SI-NEXT:    s_mov_b32 s8, s2
99; SI-NEXT:    s_mov_b32 s9, s3
100; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
101; SI-NEXT:    s_mov_b32 s4, s0
102; SI-NEXT:    s_mov_b32 s5, s1
103; SI-NEXT:    s_waitcnt vmcnt(0)
104; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
105; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
106; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
107; SI-NEXT:    s_endpgm
108;
109; VI-LABEL: uitofp_i32_to_f16:
110; VI:       ; %bb.0: ; %entry
111; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
112; VI-NEXT:    s_mov_b32 s7, 0xf000
113; VI-NEXT:    s_mov_b32 s6, -1
114; VI-NEXT:    s_mov_b32 s10, s6
115; VI-NEXT:    s_mov_b32 s11, s7
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    s_mov_b32 s8, s2
118; VI-NEXT:    s_mov_b32 s9, s3
119; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
120; VI-NEXT:    s_mov_b32 s4, s0
121; VI-NEXT:    s_mov_b32 s5, s1
122; VI-NEXT:    s_waitcnt vmcnt(0)
123; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
124; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
125; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
126; VI-NEXT:    s_endpgm
127;
128; GFX11-TRUE16-LABEL: uitofp_i32_to_f16:
129; GFX11-TRUE16:       ; %bb.0: ; %entry
130; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
131; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
132; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
133; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
134; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
135; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
137; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
138; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
139; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
140; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
141; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
142; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
143; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
144; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
145; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
146; GFX11-TRUE16-NEXT:    s_endpgm
147;
148; GFX11-FAKE16-LABEL: uitofp_i32_to_f16:
149; GFX11-FAKE16:       ; %bb.0: ; %entry
150; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
151; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
152; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
153; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
154; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
155; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
157; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
158; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
159; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
160; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
161; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
162; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
163; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
164; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
165; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
166; GFX11-FAKE16-NEXT:    s_endpgm
167    ptr addrspace(1) %r,
168    ptr addrspace(1) %a) {
169entry:
170  %a.val = load i32, ptr addrspace(1) %a
  ; The conversion under test: unsigned i32 -> half.
171  %r.val = uitofp i32 %a.val to half
172  store half %r.val, ptr addrspace(1) %r
173  ret void
174}
175
176; f16 = uitofp i64 is in uint_to_fp.i64.ll
177
; Vector <2 x i16> source: loads the pair from %a, converts both lanes to half
; with uitofp, and stores the <2 x half> result to %r.
; Expected code per run line:
;   - tahiti: splits the loaded dword with v_and/v_lshrrev, converts each lane
;     through f32, then recombines with v_lshlrev/v_or.
;   - fiji: converts the high word with an SDWA v_cvt_f16_u16 that writes
;     WORD_1, converts the low word with the e32 form, then ORs the two.
;   - gfx1100 (both true16 settings): shifts out the high half, issues two
;     v_cvt_f16_u16, and combines them with v_pack_b32_f16.
178define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
179; SI-LABEL: uitofp_v2i16_to_v2f16:
180; SI:       ; %bb.0: ; %entry
181; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
182; SI-NEXT:    s_mov_b32 s7, 0xf000
183; SI-NEXT:    s_mov_b32 s6, -1
184; SI-NEXT:    s_mov_b32 s10, s6
185; SI-NEXT:    s_mov_b32 s11, s7
186; SI-NEXT:    s_waitcnt lgkmcnt(0)
187; SI-NEXT:    s_mov_b32 s8, s2
188; SI-NEXT:    s_mov_b32 s9, s3
189; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
190; SI-NEXT:    s_mov_b32 s4, s0
191; SI-NEXT:    s_mov_b32 s5, s1
192; SI-NEXT:    s_waitcnt vmcnt(0)
193; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
194; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
195; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
196; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
197; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
198; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
199; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
200; SI-NEXT:    v_or_b32_e32 v0, v1, v0
201; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
202; SI-NEXT:    s_endpgm
203;
204; VI-LABEL: uitofp_v2i16_to_v2f16:
205; VI:       ; %bb.0: ; %entry
206; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
207; VI-NEXT:    s_mov_b32 s7, 0xf000
208; VI-NEXT:    s_mov_b32 s6, -1
209; VI-NEXT:    s_mov_b32 s10, s6
210; VI-NEXT:    s_mov_b32 s11, s7
211; VI-NEXT:    s_waitcnt lgkmcnt(0)
212; VI-NEXT:    s_mov_b32 s8, s2
213; VI-NEXT:    s_mov_b32 s9, s3
214; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
215; VI-NEXT:    s_mov_b32 s4, s0
216; VI-NEXT:    s_mov_b32 s5, s1
217; VI-NEXT:    s_waitcnt vmcnt(0)
218; VI-NEXT:    v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
219; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
220; VI-NEXT:    v_or_b32_e32 v0, v0, v1
221; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
222; VI-NEXT:    s_endpgm
223;
224; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16:
225; GFX11-TRUE16:       ; %bb.0: ; %entry
226; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
227; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
228; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
229; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
230; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
231; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
233; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
234; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
235; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
236; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
237; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
238; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
239; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
240; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
241; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v1.l
242; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
243; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
244; GFX11-TRUE16-NEXT:    s_endpgm
245;
246; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16:
247; GFX11-FAKE16:       ; %bb.0: ; %entry
248; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
249; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
250; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
251; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
252; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
253; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
255; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
256; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
257; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
258; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
259; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
260; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
261; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
262; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
263; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v1, v1
264; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
265; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
266; GFX11-FAKE16-NEXT:    s_endpgm
267    ptr addrspace(1) %r,
268    ptr addrspace(1) %a) {
269entry:
270  %a.val = load <2 x i16>, ptr addrspace(1) %a
  ; The conversion under test: unsigned <2 x i16> -> <2 x half>.
271  %r.val = uitofp <2 x i16> %a.val to <2 x half>
272  store <2 x half> %r.val, ptr addrspace(1) %r
273  ret void
274}
275
; Vector <2 x i32> source: loads the pair from %a, converts both lanes to half
; with uitofp, and stores the <2 x half> result to %r.
; Expected code: every run line converts each lane in two steps
; (v_cvt_f32_u32 then v_cvt_f16_f32). The runs differ in how the two halves
; are packed into one dword:
;   - tahiti: v_lshlrev by 16 on the lane-1 result, then v_or.
;   - fiji: the lane-1 v_cvt_f16_f32 is an SDWA form writing WORD_1, then v_or.
;   - gfx1100 (both true16 settings): v_pack_b32_f16.
276define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
277; SI-LABEL: uitofp_v2i32_to_v2f16:
278; SI:       ; %bb.0: ; %entry
279; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
280; SI-NEXT:    s_mov_b32 s7, 0xf000
281; SI-NEXT:    s_mov_b32 s6, -1
282; SI-NEXT:    s_mov_b32 s10, s6
283; SI-NEXT:    s_mov_b32 s11, s7
284; SI-NEXT:    s_waitcnt lgkmcnt(0)
285; SI-NEXT:    s_mov_b32 s8, s2
286; SI-NEXT:    s_mov_b32 s9, s3
287; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
288; SI-NEXT:    s_mov_b32 s4, s0
289; SI-NEXT:    s_mov_b32 s5, s1
290; SI-NEXT:    s_waitcnt vmcnt(0)
291; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
292; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
293; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
294; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
295; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
296; SI-NEXT:    v_or_b32_e32 v0, v0, v1
297; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
298; SI-NEXT:    s_endpgm
299;
300; VI-LABEL: uitofp_v2i32_to_v2f16:
301; VI:       ; %bb.0: ; %entry
302; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
303; VI-NEXT:    s_mov_b32 s7, 0xf000
304; VI-NEXT:    s_mov_b32 s6, -1
305; VI-NEXT:    s_mov_b32 s10, s6
306; VI-NEXT:    s_mov_b32 s11, s7
307; VI-NEXT:    s_waitcnt lgkmcnt(0)
308; VI-NEXT:    s_mov_b32 s8, s2
309; VI-NEXT:    s_mov_b32 s9, s3
310; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
311; VI-NEXT:    s_mov_b32 s4, s0
312; VI-NEXT:    s_mov_b32 s5, s1
313; VI-NEXT:    s_waitcnt vmcnt(0)
314; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
315; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
316; VI-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
317; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
318; VI-NEXT:    v_or_b32_e32 v0, v0, v1
319; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
320; VI-NEXT:    s_endpgm
321;
322; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16:
323; GFX11-TRUE16:       ; %bb.0: ; %entry
324; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
325; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
326; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
327; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
328; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
329; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
331; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
332; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
333; GFX11-TRUE16-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
334; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
335; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
336; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
337; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v0
338; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
339; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
340; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
341; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
342; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
343; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
344; GFX11-TRUE16-NEXT:    s_endpgm
345;
346; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16:
347; GFX11-FAKE16:       ; %bb.0: ; %entry
348; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
349; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
350; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
351; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
352; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
353; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
355; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
356; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
357; GFX11-FAKE16-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
358; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
359; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
360; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
361; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
362; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
363; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
364; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
365; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
366; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
367; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
368; GFX11-FAKE16-NEXT:    s_endpgm
369    ptr addrspace(1) %r,
370    ptr addrspace(1) %a) {
371entry:
372  %a.val = load <2 x i32>, ptr addrspace(1) %a
  ; The conversion under test: unsigned <2 x i32> -> <2 x half>.
373  %r.val = uitofp <2 x i32> %a.val to <2 x half>
374  store <2 x half> %r.val, ptr addrspace(1) %r
375  ret void
376}
377
; i1 source: loads one float from %in0 and one from %in1, forms the i1 value
; (%a >= 0.0) xor (%b >= 1.0) using ordered compares, converts that i1 to half
; with uitofp, and stores the result to %out.
; Expected code: every run line uses two v_cmp_le_f32 (constant on the left),
; an scalar xor of the two compare masks, a v_cndmask that selects 0 or 1.0,
; and a final v_cvt_f16_f32. The tahiti/fiji checks use 64-bit masks
; (vcc, s[0:1], s_xor_b64); the gfx1100 checks use 32-bit masks
; (vcc_lo, s0, s_xor_b32).
378define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
379; SI-LABEL: s_uint_to_fp_i1_to_f16:
380; SI:       ; %bb.0:
381; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
382; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
383; SI-NEXT:    s_mov_b32 s3, 0xf000
384; SI-NEXT:    s_mov_b32 s2, -1
385; SI-NEXT:    s_mov_b32 s6, s2
386; SI-NEXT:    s_mov_b32 s7, s3
387; SI-NEXT:    s_waitcnt lgkmcnt(0)
388; SI-NEXT:    s_mov_b32 s12, s10
389; SI-NEXT:    s_mov_b32 s13, s11
390; SI-NEXT:    s_mov_b32 s14, s2
391; SI-NEXT:    s_mov_b32 s15, s3
392; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
393; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
394; SI-NEXT:    s_waitcnt vmcnt(1)
395; SI-NEXT:    v_cmp_le_f32_e32 vcc, 1.0, v0
396; SI-NEXT:    s_waitcnt vmcnt(0)
397; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
398; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
399; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
400; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
401; SI-NEXT:    s_mov_b32 s0, s8
402; SI-NEXT:    s_mov_b32 s1, s9
403; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
404; SI-NEXT:    s_endpgm
405;
406; VI-LABEL: s_uint_to_fp_i1_to_f16:
407; VI:       ; %bb.0:
408; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
409; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
410; VI-NEXT:    s_mov_b32 s3, 0xf000
411; VI-NEXT:    s_mov_b32 s2, -1
412; VI-NEXT:    s_mov_b32 s6, s2
413; VI-NEXT:    s_mov_b32 s7, s3
414; VI-NEXT:    s_waitcnt lgkmcnt(0)
415; VI-NEXT:    s_mov_b32 s12, s10
416; VI-NEXT:    s_mov_b32 s13, s11
417; VI-NEXT:    s_mov_b32 s14, s2
418; VI-NEXT:    s_mov_b32 s15, s3
419; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
420; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
421; VI-NEXT:    s_waitcnt vmcnt(1)
422; VI-NEXT:    v_cmp_le_f32_e32 vcc, 1.0, v0
423; VI-NEXT:    s_waitcnt vmcnt(0)
424; VI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
425; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
426; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
427; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
428; VI-NEXT:    s_mov_b32 s0, s8
429; VI-NEXT:    s_mov_b32 s1, s9
430; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
431; VI-NEXT:    s_endpgm
432;
433; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16:
434; GFX11-TRUE16:       ; %bb.0:
435; GFX11-TRUE16-NEXT:    s_clause 0x1
436; GFX11-TRUE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
437; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
438; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
439; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
440; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
441; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
442; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s6
443; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s7
444; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s10
446; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s11
447; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
448; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
449; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s8
450; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s9
451; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
452; GFX11-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, 1.0, v0
453; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
454; GFX11-TRUE16-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
455; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
456; GFX11-TRUE16-NEXT:    s_xor_b32 s0, s0, vcc_lo
457; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
458; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
459; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
460; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
461; GFX11-TRUE16-NEXT:    s_endpgm
462;
463; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16:
464; GFX11-FAKE16:       ; %bb.0:
465; GFX11-FAKE16-NEXT:    s_clause 0x1
466; GFX11-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
467; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
468; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
469; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
470; GFX11-FAKE16-NEXT:    s_mov_b32 s2, s6
471; GFX11-FAKE16-NEXT:    s_mov_b32 s3, s7
472; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s6
473; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s7
474; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s10
476; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s11
477; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
478; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
479; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s8
480; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s9
481; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
482; GFX11-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, 1.0, v0
483; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
484; GFX11-FAKE16-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
485; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
486; GFX11-FAKE16-NEXT:    s_xor_b32 s0, s0, vcc_lo
487; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
488; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
489; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
490; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
491; GFX11-FAKE16-NEXT:    s_endpgm
492  %a = load float, ptr addrspace(1) %in0
493  %b = load float, ptr addrspace(1) %in1
494  %acmp = fcmp oge float %a, 0.000000e+00
495  %bcmp = fcmp oge float %b, 1.000000e+00
  ; The xor makes the i1 depend on both loaded values, so it is not a constant.
496  %result = xor i1 %acmp, %bcmp
  ; The conversion under test: i1 -> half.
497  %fp = uitofp i1 %result to half
498  store half %fp, ptr addrspace(1) %out
499  ret void
500}
501
502; f16 = uitofp i64 is in uint_to_fp.i64.ll
503