xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
6
7
; Scalar case: load a half from %a, convert it to i16 with fptoui, and store
; the result to %r.
; According to the autogenerated checks below, the tahiti run extends the
; value to f32 (v_cvt_f32_f16) and converts with v_cvt_u32_f32, whereas the
; fiji and gfx1100 runs convert directly with v_cvt_u16_f16. The +real-true16
; run performs that conversion on the v0.l half of the register.
8define amdgpu_kernel void @fptoui_f16_to_i16(
9; SI-LABEL: fptoui_f16_to_i16:
10; SI:       ; %bb.0: ; %entry
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
24; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
25; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: fptoui_f16_to_i16:
29; VI:       ; %bb.0: ; %entry
30; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
31; VI-NEXT:    s_mov_b32 s7, 0xf000
32; VI-NEXT:    s_mov_b32 s6, -1
33; VI-NEXT:    s_mov_b32 s10, s6
34; VI-NEXT:    s_mov_b32 s11, s7
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    s_mov_b32 s8, s2
37; VI-NEXT:    s_mov_b32 s9, s3
38; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
39; VI-NEXT:    s_mov_b32 s4, s0
40; VI-NEXT:    s_mov_b32 s5, s1
41; VI-NEXT:    s_waitcnt vmcnt(0)
42; VI-NEXT:    v_cvt_u16_f16_e32 v0, v0
43; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
44; VI-NEXT:    s_endpgm
45;
46; GFX11-TRUE16-LABEL: fptoui_f16_to_i16:
47; GFX11-TRUE16:       ; %bb.0: ; %entry
48; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
49; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
50; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
51; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
52; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
53; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
55; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
56; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
57; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
58; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
59; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
60; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
61; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
62; GFX11-TRUE16-NEXT:    s_endpgm
63;
64; GFX11-FAKE16-LABEL: fptoui_f16_to_i16:
65; GFX11-FAKE16:       ; %bb.0: ; %entry
66; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
67; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
68; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
69; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
70; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
71; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
73; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
74; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
75; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
76; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
77; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
78; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v0, v0
79; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
80; GFX11-FAKE16-NEXT:    s_endpgm
81    ptr addrspace(1) %r,
82    ptr addrspace(1) %a) {
83entry:
84  %a.val = load half, ptr addrspace(1) %a
85  %r.val = fptoui half %a.val to i16
86  store i16 %r.val, ptr addrspace(1) %r
87  ret void
88}
89
; Scalar case: load a half from %a, convert it to i32 with fptoui, and store
; the result to %r.
; According to the autogenerated checks below, every run configuration first
; extends the value to f32 (v_cvt_f32_f16) and then converts with
; v_cvt_u32_f32; the +real-true16 run reads the source from v0.l.
90define amdgpu_kernel void @fptoui_f16_to_i32(
91; SI-LABEL: fptoui_f16_to_i32:
92; SI:       ; %bb.0: ; %entry
93; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
94; SI-NEXT:    s_mov_b32 s7, 0xf000
95; SI-NEXT:    s_mov_b32 s6, -1
96; SI-NEXT:    s_mov_b32 s10, s6
97; SI-NEXT:    s_mov_b32 s11, s7
98; SI-NEXT:    s_waitcnt lgkmcnt(0)
99; SI-NEXT:    s_mov_b32 s8, s2
100; SI-NEXT:    s_mov_b32 s9, s3
101; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
102; SI-NEXT:    s_mov_b32 s4, s0
103; SI-NEXT:    s_mov_b32 s5, s1
104; SI-NEXT:    s_waitcnt vmcnt(0)
105; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
106; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
107; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
108; SI-NEXT:    s_endpgm
109;
110; VI-LABEL: fptoui_f16_to_i32:
111; VI:       ; %bb.0: ; %entry
112; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
113; VI-NEXT:    s_mov_b32 s7, 0xf000
114; VI-NEXT:    s_mov_b32 s6, -1
115; VI-NEXT:    s_mov_b32 s10, s6
116; VI-NEXT:    s_mov_b32 s11, s7
117; VI-NEXT:    s_waitcnt lgkmcnt(0)
118; VI-NEXT:    s_mov_b32 s8, s2
119; VI-NEXT:    s_mov_b32 s9, s3
120; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
121; VI-NEXT:    s_mov_b32 s4, s0
122; VI-NEXT:    s_mov_b32 s5, s1
123; VI-NEXT:    s_waitcnt vmcnt(0)
124; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
125; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
126; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
127; VI-NEXT:    s_endpgm
128;
129; GFX11-TRUE16-LABEL: fptoui_f16_to_i32:
130; GFX11-TRUE16:       ; %bb.0: ; %entry
131; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
132; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
133; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
134; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
135; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
136; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
137; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
138; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
139; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
140; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
141; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
142; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
143; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
144; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
145; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
146; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
147; GFX11-TRUE16-NEXT:    s_endpgm
148;
149; GFX11-FAKE16-LABEL: fptoui_f16_to_i32:
150; GFX11-FAKE16:       ; %bb.0: ; %entry
151; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
152; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
153; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
154; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
155; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
156; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
158; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
159; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
160; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
161; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
162; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
163; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
164; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
165; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
166; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
167; GFX11-FAKE16-NEXT:    s_endpgm
168    ptr addrspace(1) %r,
169    ptr addrspace(1) %a) {
170entry:
171  %a.val = load half, ptr addrspace(1) %a
172  %r.val = fptoui half %a.val to i32
173  store i32 %r.val, ptr addrspace(1) %r
174  ret void
175}
176
177; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
178; test checks code generated for 'i64 = fp_to_uint f32'.
179
; Scalar case: load a half from %a, convert it to i64 with fptoui, and store
; the result to %r.
; According to the autogenerated checks below, every run configuration
; produces the low dword with v_cvt_f32_f16 followed by v_cvt_u32_f32, writes
; a constant 0 into v1 for the high dword, and stores the v[0:1] pair with a
; single 64-bit buffer store.
179define amdgpu_kernel void @fptoui_f16_to_i64(
180; SI-LABEL: fptoui_f16_to_i64:
181; SI:       ; %bb.0: ; %entry
182; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
183; SI-NEXT:    s_mov_b32 s7, 0xf000
184; SI-NEXT:    s_mov_b32 s6, -1
185; SI-NEXT:    s_mov_b32 s10, s6
186; SI-NEXT:    s_mov_b32 s11, s7
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    s_mov_b32 s8, s2
189; SI-NEXT:    s_mov_b32 s9, s3
190; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
191; SI-NEXT:    s_mov_b32 s4, s0
192; SI-NEXT:    s_mov_b32 s5, s1
193; SI-NEXT:    v_mov_b32_e32 v1, 0
194; SI-NEXT:    s_waitcnt vmcnt(0)
195; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
196; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
197; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
198; SI-NEXT:    s_endpgm
199;
200; VI-LABEL: fptoui_f16_to_i64:
201; VI:       ; %bb.0: ; %entry
202; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
203; VI-NEXT:    s_mov_b32 s7, 0xf000
204; VI-NEXT:    s_mov_b32 s6, -1
205; VI-NEXT:    s_mov_b32 s10, s6
206; VI-NEXT:    s_mov_b32 s11, s7
207; VI-NEXT:    s_waitcnt lgkmcnt(0)
208; VI-NEXT:    s_mov_b32 s8, s2
209; VI-NEXT:    s_mov_b32 s9, s3
210; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
211; VI-NEXT:    s_mov_b32 s4, s0
212; VI-NEXT:    s_mov_b32 s5, s1
213; VI-NEXT:    v_mov_b32_e32 v1, 0
214; VI-NEXT:    s_waitcnt vmcnt(0)
215; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
216; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
217; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
218; VI-NEXT:    s_endpgm
219;
220; GFX11-TRUE16-LABEL: fptoui_f16_to_i64:
221; GFX11-TRUE16:       ; %bb.0: ; %entry
222; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
223; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
224; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
225; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
226; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
227; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
228; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
229; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
230; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
231; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
232; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
233; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
234; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
235; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
236; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
237; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
238; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
239; GFX11-TRUE16-NEXT:    s_endpgm
240;
241; GFX11-FAKE16-LABEL: fptoui_f16_to_i64:
242; GFX11-FAKE16:       ; %bb.0: ; %entry
243; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
244; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
245; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
246; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
247; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
248; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
249; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
251; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
252; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
253; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
254; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
255; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
256; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
257; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
258; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
259; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
260; GFX11-FAKE16-NEXT:    s_endpgm
261    ptr addrspace(1) %r,
262    ptr addrspace(1) %a) {
263entry:
264  %a.val = load half, ptr addrspace(1) %a
265  %r.val = fptoui half %a.val to i64
266  store i64 %r.val, ptr addrspace(1) %r
267  ret void
268}
270
; Vector case: load a <2 x half> from %a, convert it to <2 x i16> with fptoui,
; and store the result to %r.
; According to the autogenerated checks below, the vector is loaded as one
; dword. The tahiti run shifts out the high element, converts both elements
; through f32, and repacks them with a shift and an or. The fiji run uses
; v_cvt_u16_f16 in its e32 and SDWA forms and merges with an SDWA v_or_b32.
; The gfx1100 runs convert each element with v_cvt_u16_f16 and pack the pair
; with v_lshl_or_b32.
271define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
272; SI-LABEL: fptoui_v2f16_to_v2i16:
273; SI:       ; %bb.0: ; %entry
274; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
275; SI-NEXT:    s_mov_b32 s7, 0xf000
276; SI-NEXT:    s_mov_b32 s6, -1
277; SI-NEXT:    s_mov_b32 s10, s6
278; SI-NEXT:    s_mov_b32 s11, s7
279; SI-NEXT:    s_waitcnt lgkmcnt(0)
280; SI-NEXT:    s_mov_b32 s8, s2
281; SI-NEXT:    s_mov_b32 s9, s3
282; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
283; SI-NEXT:    s_mov_b32 s4, s0
284; SI-NEXT:    s_mov_b32 s5, s1
285; SI-NEXT:    s_waitcnt vmcnt(0)
286; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
287; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
288; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
289; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
290; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
291; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
292; SI-NEXT:    v_or_b32_e32 v0, v0, v1
293; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
294; SI-NEXT:    s_endpgm
295;
296; VI-LABEL: fptoui_v2f16_to_v2i16:
297; VI:       ; %bb.0: ; %entry
298; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
299; VI-NEXT:    s_mov_b32 s7, 0xf000
300; VI-NEXT:    s_mov_b32 s6, -1
301; VI-NEXT:    s_mov_b32 s10, s6
302; VI-NEXT:    s_mov_b32 s11, s7
303; VI-NEXT:    s_waitcnt lgkmcnt(0)
304; VI-NEXT:    s_mov_b32 s8, s2
305; VI-NEXT:    s_mov_b32 s9, s3
306; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
307; VI-NEXT:    s_mov_b32 s4, s0
308; VI-NEXT:    s_mov_b32 s5, s1
309; VI-NEXT:    s_waitcnt vmcnt(0)
310; VI-NEXT:    v_cvt_u16_f16_e32 v1, v0
311; VI-NEXT:    v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
312; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
313; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
314; VI-NEXT:    s_endpgm
315;
316; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16:
317; GFX11-TRUE16:       ; %bb.0: ; %entry
318; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
319; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
320; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
321; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
322; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
323; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
325; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
326; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
327; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
328; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
329; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
330; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
331; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
332; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
333; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.h, v1.l
334; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
335; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
336; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
337; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
338; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
339; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
340; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
341; GFX11-TRUE16-NEXT:    s_endpgm
342;
343; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16:
344; GFX11-FAKE16:       ; %bb.0: ; %entry
345; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
346; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
347; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
348; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
349; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
350; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
352; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
353; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
354; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
355; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
356; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
357; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
358; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v0, v0
359; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
360; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v1, v1
361; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
362; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
363; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
364; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
365; GFX11-FAKE16-NEXT:    s_endpgm
366    ptr addrspace(1) %r,
367    ptr addrspace(1) %a) {
368entry:
369  %a.val = load <2 x half>, ptr addrspace(1) %a
370  %r.val = fptoui <2 x half> %a.val to <2 x i16>
371  store <2 x i16> %r.val, ptr addrspace(1) %r
372  ret void
373}
374
; Vector case: load a <2 x half> from %a, convert it to <2 x i32> with fptoui,
; and store the result to %r.
; According to the autogenerated checks below, every run configuration
; extends each element to f32 (v_cvt_f32_f16) and converts it with
; v_cvt_u32_f32, leaving the two results in v[0:1] for one 64-bit store. The
; high element is reached with a 16-bit right shift on tahiti and gfx1100, and
; with an SDWA src0_sel of WORD_1 on fiji.
375define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
376; SI-LABEL: fptoui_v2f16_to_v2i32:
377; SI:       ; %bb.0: ; %entry
378; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
379; SI-NEXT:    s_mov_b32 s7, 0xf000
380; SI-NEXT:    s_mov_b32 s6, -1
381; SI-NEXT:    s_mov_b32 s10, s6
382; SI-NEXT:    s_mov_b32 s11, s7
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    s_mov_b32 s8, s2
385; SI-NEXT:    s_mov_b32 s9, s3
386; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
387; SI-NEXT:    s_mov_b32 s4, s0
388; SI-NEXT:    s_mov_b32 s5, s1
389; SI-NEXT:    s_waitcnt vmcnt(0)
390; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
391; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
392; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
393; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
394; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
395; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
396; SI-NEXT:    s_endpgm
397;
398; VI-LABEL: fptoui_v2f16_to_v2i32:
399; VI:       ; %bb.0: ; %entry
400; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
401; VI-NEXT:    s_mov_b32 s7, 0xf000
402; VI-NEXT:    s_mov_b32 s6, -1
403; VI-NEXT:    s_mov_b32 s10, s6
404; VI-NEXT:    s_mov_b32 s11, s7
405; VI-NEXT:    s_waitcnt lgkmcnt(0)
406; VI-NEXT:    s_mov_b32 s8, s2
407; VI-NEXT:    s_mov_b32 s9, s3
408; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
409; VI-NEXT:    s_mov_b32 s4, s0
410; VI-NEXT:    s_mov_b32 s5, s1
411; VI-NEXT:    s_waitcnt vmcnt(0)
412; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
413; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
414; VI-NEXT:    v_cvt_u32_f32_e32 v0, v1
415; VI-NEXT:    v_cvt_u32_f32_e32 v1, v2
416; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
417; VI-NEXT:    s_endpgm
418;
419; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32:
420; GFX11-TRUE16:       ; %bb.0: ; %entry
421; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
422; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
423; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
424; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
425; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
426; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
428; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
429; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
430; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
431; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
432; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
433; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
434; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
435; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
436; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
437; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
438; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
439; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
440; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
441; GFX11-TRUE16-NEXT:    s_endpgm
442;
443; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32:
444; GFX11-FAKE16:       ; %bb.0: ; %entry
445; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
446; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
447; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
448; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
449; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
450; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
452; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
453; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
454; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
455; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
456; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
457; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
458; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
459; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
460; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
461; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
462; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
463; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
464; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
465; GFX11-FAKE16-NEXT:    s_endpgm
466    ptr addrspace(1) %r,
467    ptr addrspace(1) %a) {
468entry:
469  %a.val = load <2 x half>, ptr addrspace(1) %a
470  %r.val = fptoui <2 x half> %a.val to <2 x i32>
471  store <2 x i32> %r.val, ptr addrspace(1) %r
472  ret void
473}
474
475; Need to make sure we promote f16 to f32 when converting v2f16 to v2i64.
476; Existing test checks code generated for 'i64 = fp_to_uint f32'.
477
; Vector case: load a <2 x half> from %a, convert it to <2 x i64> with fptoui,
; and store the result to %r.
; According to the autogenerated checks below, every run configuration
; extends each element to f32 (v_cvt_f32_f16) and converts it with
; v_cvt_u32_f32 into v0 and v2, the low dwords of the two results. The high
; dwords v1 and v3 are set to 0 (v3 is copied from v1), and v[0:3] is written
; with a single 128-bit buffer store.
478define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
479; SI-LABEL: fptoui_v2f16_to_v2i64:
480; SI:       ; %bb.0: ; %entry
481; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
482; SI-NEXT:    s_mov_b32 s7, 0xf000
483; SI-NEXT:    s_mov_b32 s6, -1
484; SI-NEXT:    s_mov_b32 s10, s6
485; SI-NEXT:    s_mov_b32 s11, s7
486; SI-NEXT:    s_waitcnt lgkmcnt(0)
487; SI-NEXT:    s_mov_b32 s8, s2
488; SI-NEXT:    s_mov_b32 s9, s3
489; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
490; SI-NEXT:    s_mov_b32 s4, s0
491; SI-NEXT:    s_mov_b32 s5, s1
492; SI-NEXT:    s_waitcnt vmcnt(0)
493; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
494; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
495; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
496; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
497; SI-NEXT:    v_cvt_u32_f32_e32 v2, v1
498; SI-NEXT:    v_mov_b32_e32 v1, 0
499; SI-NEXT:    v_mov_b32_e32 v3, v1
500; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
501; SI-NEXT:    s_endpgm
502;
503; VI-LABEL: fptoui_v2f16_to_v2i64:
504; VI:       ; %bb.0: ; %entry
505; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
506; VI-NEXT:    s_mov_b32 s7, 0xf000
507; VI-NEXT:    s_mov_b32 s6, -1
508; VI-NEXT:    s_mov_b32 s10, s6
509; VI-NEXT:    s_mov_b32 s11, s7
510; VI-NEXT:    s_waitcnt lgkmcnt(0)
511; VI-NEXT:    s_mov_b32 s8, s2
512; VI-NEXT:    s_mov_b32 s9, s3
513; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
514; VI-NEXT:    s_mov_b32 s4, s0
515; VI-NEXT:    s_mov_b32 s5, s1
516; VI-NEXT:    s_waitcnt vmcnt(0)
517; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
518; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
519; VI-NEXT:    v_cvt_u32_f32_e32 v0, v1
520; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
521; VI-NEXT:    v_mov_b32_e32 v1, 0
522; VI-NEXT:    v_mov_b32_e32 v3, v1
523; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
524; VI-NEXT:    s_endpgm
525;
526; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64:
527; GFX11-TRUE16:       ; %bb.0: ; %entry
528; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
529; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
530; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
531; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
532; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
533; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
535; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
536; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
537; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
538; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
539; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
540; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
541; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
542; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
543; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
544; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
545; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
546; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
547; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
548; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
549; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
550; GFX11-TRUE16-NEXT:    s_endpgm
551;
552; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64:
553; GFX11-FAKE16:       ; %bb.0: ; %entry
554; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
555; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
556; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
557; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
558; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
559; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
561; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
562; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
563; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
564; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
565; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
566; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
567; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
568; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
569; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
570; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
571; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
572; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
573; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
574; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
575; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
576; GFX11-FAKE16-NEXT:    s_endpgm
577    ptr addrspace(1) %r,
578    ptr addrspace(1) %a) {
579entry:
580  %a.val = load <2 x half>, ptr addrspace(1) %a
581  %r.val = fptoui <2 x half> %a.val to <2 x i64>
582  store <2 x i64> %r.val, ptr addrspace(1) %r
583  ret void
584}
585
; Scalar case with a 1-bit result: convert the half kernel argument %in to i1
; with fptoui and store it to %out.
; According to the autogenerated checks below, every run configuration
; implements the conversion as an equality compare against 1.0 followed by a
; v_cndmask_b32 that selects 0 or 1, and writes the result with a byte store.
; The tahiti run compares in f32 after v_cvt_f32_f16; the other runs use
; v_cmp_eq_f16.
586define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
587; SI-LABEL: fptoui_f16_to_i1:
588; SI:       ; %bb.0: ; %entry
589; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
590; SI-NEXT:    s_mov_b32 s3, 0xf000
591; SI-NEXT:    s_mov_b32 s2, -1
592; SI-NEXT:    s_waitcnt lgkmcnt(0)
593; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
594; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
595; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 1.0, v0
596; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
597; SI-NEXT:    s_waitcnt lgkmcnt(0)
598; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
599; SI-NEXT:    s_endpgm
600;
601; VI-LABEL: fptoui_f16_to_i1:
602; VI:       ; %bb.0: ; %entry
603; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
604; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
605; VI-NEXT:    s_mov_b32 s3, 0xf000
606; VI-NEXT:    s_mov_b32 s2, -1
607; VI-NEXT:    s_waitcnt lgkmcnt(0)
608; VI-NEXT:    v_cmp_eq_f16_e64 s[4:5], 1.0, s6
609; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
610; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
611; VI-NEXT:    s_endpgm
612;
613; GFX11-TRUE16-LABEL: fptoui_f16_to_i1:
614; GFX11-TRUE16:       ; %bb.0: ; %entry
615; GFX11-TRUE16-NEXT:    s_clause 0x1
616; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
617; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
618; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
619; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
620; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
621; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
622; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
623; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
624; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
625; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
626; GFX11-TRUE16-NEXT:    s_endpgm
627;
628; GFX11-FAKE16-LABEL: fptoui_f16_to_i1:
629; GFX11-FAKE16:       ; %bb.0: ; %entry
630; GFX11-FAKE16-NEXT:    s_clause 0x1
631; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
632; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
633; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
634; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX11-FAKE16-NEXT:    v_cmp_eq_f16_e64 s2, 1.0, s2
636; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
637; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
638; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
639; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
640; GFX11-FAKE16-NEXT:    s_endpgm
641entry:
642  %conv = fptoui half %in to i1
643  store i1 %conv, ptr addrspace(1) %out
644  ret void
645}
646