xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
6
7
; Scalar half to i16. The kernel loads one half from %a, converts it with
; fptosi, and stores the i16 result to %r.
; The SI checks expect the value to be widened with v_cvt_f32_f16 and then
; converted with v_cvt_i32_f32 before a 16-bit store. The VI and both GFX11
; runs expect a single v_cvt_i16_f16 (the true16 run uses the v0.l operands).
8define amdgpu_kernel void @fptosi_f16_to_i16(
9; SI-LABEL: fptosi_f16_to_i16:
10; SI:       ; %bb.0: ; %entry
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
24; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
25; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: fptosi_f16_to_i16:
29; VI:       ; %bb.0: ; %entry
30; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
31; VI-NEXT:    s_mov_b32 s7, 0xf000
32; VI-NEXT:    s_mov_b32 s6, -1
33; VI-NEXT:    s_mov_b32 s10, s6
34; VI-NEXT:    s_mov_b32 s11, s7
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    s_mov_b32 s8, s2
37; VI-NEXT:    s_mov_b32 s9, s3
38; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
39; VI-NEXT:    s_mov_b32 s4, s0
40; VI-NEXT:    s_mov_b32 s5, s1
41; VI-NEXT:    s_waitcnt vmcnt(0)
42; VI-NEXT:    v_cvt_i16_f16_e32 v0, v0
43; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
44; VI-NEXT:    s_endpgm
45;
46; GFX11-TRUE16-LABEL: fptosi_f16_to_i16:
47; GFX11-TRUE16:       ; %bb.0: ; %entry
48; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
49; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
50; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
51; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
52; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
53; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
55; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
56; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
57; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
58; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
59; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
60; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.l, v0.l
61; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
62; GFX11-TRUE16-NEXT:    s_endpgm
63;
64; GFX11-FAKE16-LABEL: fptosi_f16_to_i16:
65; GFX11-FAKE16:       ; %bb.0: ; %entry
66; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
67; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
68; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
69; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
70; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
71; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
73; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
74; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
75; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
76; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
77; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
78; GFX11-FAKE16-NEXT:    v_cvt_i16_f16_e32 v0, v0
79; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
80; GFX11-FAKE16-NEXT:    s_endpgm
81    ptr addrspace(1) %r,
82    ptr addrspace(1) %a) {
83entry:
84  %a.val = load half, ptr addrspace(1) %a
85  %r.val = fptosi half %a.val to i16
86  store i16 %r.val, ptr addrspace(1) %r
87  ret void
88}
89
; Scalar half to i32. The kernel loads one half from %a, converts it with
; fptosi, and stores the i32 result to %r.
; All four runs expect the same two-step lowering, v_cvt_f32_f16 followed by
; v_cvt_i32_f32, and a 32-bit store. The GFX11 runs additionally expect an
; s_delay_alu between the two conversions.
90define amdgpu_kernel void @fptosi_f16_to_i32(
91; SI-LABEL: fptosi_f16_to_i32:
92; SI:       ; %bb.0: ; %entry
93; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
94; SI-NEXT:    s_mov_b32 s7, 0xf000
95; SI-NEXT:    s_mov_b32 s6, -1
96; SI-NEXT:    s_mov_b32 s10, s6
97; SI-NEXT:    s_mov_b32 s11, s7
98; SI-NEXT:    s_waitcnt lgkmcnt(0)
99; SI-NEXT:    s_mov_b32 s8, s2
100; SI-NEXT:    s_mov_b32 s9, s3
101; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
102; SI-NEXT:    s_mov_b32 s4, s0
103; SI-NEXT:    s_mov_b32 s5, s1
104; SI-NEXT:    s_waitcnt vmcnt(0)
105; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
106; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
107; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
108; SI-NEXT:    s_endpgm
109;
110; VI-LABEL: fptosi_f16_to_i32:
111; VI:       ; %bb.0: ; %entry
112; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
113; VI-NEXT:    s_mov_b32 s7, 0xf000
114; VI-NEXT:    s_mov_b32 s6, -1
115; VI-NEXT:    s_mov_b32 s10, s6
116; VI-NEXT:    s_mov_b32 s11, s7
117; VI-NEXT:    s_waitcnt lgkmcnt(0)
118; VI-NEXT:    s_mov_b32 s8, s2
119; VI-NEXT:    s_mov_b32 s9, s3
120; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
121; VI-NEXT:    s_mov_b32 s4, s0
122; VI-NEXT:    s_mov_b32 s5, s1
123; VI-NEXT:    s_waitcnt vmcnt(0)
124; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
125; VI-NEXT:    v_cvt_i32_f32_e32 v0, v0
126; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
127; VI-NEXT:    s_endpgm
128;
129; GFX11-TRUE16-LABEL: fptosi_f16_to_i32:
130; GFX11-TRUE16:       ; %bb.0: ; %entry
131; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
132; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
133; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
134; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
135; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
136; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
137; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
138; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
139; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
140; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
141; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
142; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
143; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
144; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
145; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
146; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
147; GFX11-TRUE16-NEXT:    s_endpgm
148;
149; GFX11-FAKE16-LABEL: fptosi_f16_to_i32:
150; GFX11-FAKE16:       ; %bb.0: ; %entry
151; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
152; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
153; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
154; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
155; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
156; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
158; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
159; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
160; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
161; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
162; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
163; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
164; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
165; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
166; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
167; GFX11-FAKE16-NEXT:    s_endpgm
168    ptr addrspace(1) %r,
169    ptr addrspace(1) %a) {
170entry:
171  %a.val = load half, ptr addrspace(1) %a
172  %r.val = fptosi half %a.val to i32
173  store i32 %r.val, ptr addrspace(1) %r
174  ret void
175}
176
177; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
178; test checks code generated for 'i64 = fp_to_sint f32'.
179
; Scalar half to i64. The kernel loads one half from %a, converts it with
; fptosi, and stores the i64 result to %r.
; All four runs expect v_cvt_f32_f16 then v_cvt_i32_f32 for the low dword, and
; a v_ashrrev_i32 by 31 of that result to produce the high dword, followed by
; a 64-bit store of the register pair.
180define amdgpu_kernel void @fptosi_f16_to_i64(
181; SI-LABEL: fptosi_f16_to_i64:
182; SI:       ; %bb.0: ; %entry
183; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
184; SI-NEXT:    s_mov_b32 s7, 0xf000
185; SI-NEXT:    s_mov_b32 s6, -1
186; SI-NEXT:    s_mov_b32 s10, s6
187; SI-NEXT:    s_mov_b32 s11, s7
188; SI-NEXT:    s_waitcnt lgkmcnt(0)
189; SI-NEXT:    s_mov_b32 s8, s2
190; SI-NEXT:    s_mov_b32 s9, s3
191; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
192; SI-NEXT:    s_mov_b32 s4, s0
193; SI-NEXT:    s_mov_b32 s5, s1
194; SI-NEXT:    s_waitcnt vmcnt(0)
195; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
196; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
197; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
198; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
199; SI-NEXT:    s_endpgm
200;
201; VI-LABEL: fptosi_f16_to_i64:
202; VI:       ; %bb.0: ; %entry
203; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
204; VI-NEXT:    s_mov_b32 s7, 0xf000
205; VI-NEXT:    s_mov_b32 s6, -1
206; VI-NEXT:    s_mov_b32 s10, s6
207; VI-NEXT:    s_mov_b32 s11, s7
208; VI-NEXT:    s_waitcnt lgkmcnt(0)
209; VI-NEXT:    s_mov_b32 s8, s2
210; VI-NEXT:    s_mov_b32 s9, s3
211; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
212; VI-NEXT:    s_mov_b32 s4, s0
213; VI-NEXT:    s_mov_b32 s5, s1
214; VI-NEXT:    s_waitcnt vmcnt(0)
215; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
216; VI-NEXT:    v_cvt_i32_f32_e32 v0, v0
217; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
218; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
219; VI-NEXT:    s_endpgm
220;
221; GFX11-TRUE16-LABEL: fptosi_f16_to_i64:
222; GFX11-TRUE16:       ; %bb.0: ; %entry
223; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
224; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
225; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
226; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
227; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
228; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
229; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
230; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
231; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
232; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
233; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
234; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
235; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
236; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
237; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
238; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
239; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
240; GFX11-TRUE16-NEXT:    s_endpgm
241;
242; GFX11-FAKE16-LABEL: fptosi_f16_to_i64:
243; GFX11-FAKE16:       ; %bb.0: ; %entry
244; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
245; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
246; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
247; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
248; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
249; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
251; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
252; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
253; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
254; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
255; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
256; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
257; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
258; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
259; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
260; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
261; GFX11-FAKE16-NEXT:    s_endpgm
262    ptr addrspace(1) %r,
263    ptr addrspace(1) %a) {
264entry:
265  %a.val = load half, ptr addrspace(1) %a
266  %r.val = fptosi half %a.val to i64
267  store i64 %r.val, ptr addrspace(1) %r
268  ret void
269}
270
; Vector <2 x half> to <2 x i16>. The kernel loads both halves from %a as one
; value, converts them with fptosi, and stores the packed result to %r.
; The SI checks expect the high element to be shifted down, each element to go
; through v_cvt_f32_f16 and v_cvt_i32_f32, and the results to be repacked with
; and, shift and or. The VI checks expect v_cvt_i16_f16 for the low element, an
; SDWA v_cvt_i16_f16 reading WORD_1 for the high element, and an SDWA or to
; pack. Both GFX11 runs expect a shift for the high element, one v_cvt_i16_f16
; per element, and v_lshl_or_b32 to pack (the true16 run also expects
; v_mov_b16 copies between the 16-bit register halves).
271define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
272; SI-LABEL: fptosi_v2f16_to_v2i16:
273; SI:       ; %bb.0: ; %entry
274; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
275; SI-NEXT:    s_mov_b32 s7, 0xf000
276; SI-NEXT:    s_mov_b32 s6, -1
277; SI-NEXT:    s_mov_b32 s10, s6
278; SI-NEXT:    s_mov_b32 s11, s7
279; SI-NEXT:    s_waitcnt lgkmcnt(0)
280; SI-NEXT:    s_mov_b32 s8, s2
281; SI-NEXT:    s_mov_b32 s9, s3
282; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
283; SI-NEXT:    s_mov_b32 s4, s0
284; SI-NEXT:    s_mov_b32 s5, s1
285; SI-NEXT:    s_waitcnt vmcnt(0)
286; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
287; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
288; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
289; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
290; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
291; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
292; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
293; SI-NEXT:    v_or_b32_e32 v0, v0, v1
294; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
295; SI-NEXT:    s_endpgm
296;
297; VI-LABEL: fptosi_v2f16_to_v2i16:
298; VI:       ; %bb.0: ; %entry
299; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
300; VI-NEXT:    s_mov_b32 s7, 0xf000
301; VI-NEXT:    s_mov_b32 s6, -1
302; VI-NEXT:    s_mov_b32 s10, s6
303; VI-NEXT:    s_mov_b32 s11, s7
304; VI-NEXT:    s_waitcnt lgkmcnt(0)
305; VI-NEXT:    s_mov_b32 s8, s2
306; VI-NEXT:    s_mov_b32 s9, s3
307; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
308; VI-NEXT:    s_mov_b32 s4, s0
309; VI-NEXT:    s_mov_b32 s5, s1
310; VI-NEXT:    s_waitcnt vmcnt(0)
311; VI-NEXT:    v_cvt_i16_f16_e32 v1, v0
312; VI-NEXT:    v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
313; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
314; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
315; VI-NEXT:    s_endpgm
316;
317; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16:
318; GFX11-TRUE16:       ; %bb.0: ; %entry
319; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
320; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
321; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
322; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
323; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
324; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
326; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
327; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
328; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
329; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
330; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
331; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
332; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.l, v0.l
333; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
334; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.h, v1.l
335; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
336; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
337; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
338; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
339; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
340; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
341; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
342; GFX11-TRUE16-NEXT:    s_endpgm
343;
344; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16:
345; GFX11-FAKE16:       ; %bb.0: ; %entry
346; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
347; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
348; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
349; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
350; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
351; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
353; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
354; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
355; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
356; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
357; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
358; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
359; GFX11-FAKE16-NEXT:    v_cvt_i16_f16_e32 v0, v0
360; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
361; GFX11-FAKE16-NEXT:    v_cvt_i16_f16_e32 v1, v1
362; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
363; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
364; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
365; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
366; GFX11-FAKE16-NEXT:    s_endpgm
367    ptr addrspace(1) %r,
368    ptr addrspace(1) %a) {
369entry:
370  %a.val = load <2 x half>, ptr addrspace(1) %a
371  %r.val = fptosi <2 x half> %a.val to <2 x i16>
372  store <2 x i16> %r.val, ptr addrspace(1) %r
373  ret void
374}
375
; Vector <2 x half> to <2 x i32>. The kernel loads both halves from %a as one
; value, converts them with fptosi, and stores the two i32 results to %r.
; Every run expects each element to go through v_cvt_f32_f16 and then
; v_cvt_i32_f32, with the pair written by one 64-bit store. The runs differ in
; how the high element is extracted. The VI checks expect an SDWA
; v_cvt_f32_f16 reading WORD_1, while the other runs expect a right shift by
; 16 first.
376define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
377; SI-LABEL: fptosi_v2f16_to_v2i32:
378; SI:       ; %bb.0: ; %entry
379; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
380; SI-NEXT:    s_mov_b32 s7, 0xf000
381; SI-NEXT:    s_mov_b32 s6, -1
382; SI-NEXT:    s_mov_b32 s10, s6
383; SI-NEXT:    s_mov_b32 s11, s7
384; SI-NEXT:    s_waitcnt lgkmcnt(0)
385; SI-NEXT:    s_mov_b32 s8, s2
386; SI-NEXT:    s_mov_b32 s9, s3
387; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
388; SI-NEXT:    s_mov_b32 s4, s0
389; SI-NEXT:    s_mov_b32 s5, s1
390; SI-NEXT:    s_waitcnt vmcnt(0)
391; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
392; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
393; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
394; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
395; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
396; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
397; SI-NEXT:    s_endpgm
398;
399; VI-LABEL: fptosi_v2f16_to_v2i32:
400; VI:       ; %bb.0: ; %entry
401; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
402; VI-NEXT:    s_mov_b32 s7, 0xf000
403; VI-NEXT:    s_mov_b32 s6, -1
404; VI-NEXT:    s_mov_b32 s10, s6
405; VI-NEXT:    s_mov_b32 s11, s7
406; VI-NEXT:    s_waitcnt lgkmcnt(0)
407; VI-NEXT:    s_mov_b32 s8, s2
408; VI-NEXT:    s_mov_b32 s9, s3
409; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
410; VI-NEXT:    s_mov_b32 s4, s0
411; VI-NEXT:    s_mov_b32 s5, s1
412; VI-NEXT:    s_waitcnt vmcnt(0)
413; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
414; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
415; VI-NEXT:    v_cvt_i32_f32_e32 v0, v1
416; VI-NEXT:    v_cvt_i32_f32_e32 v1, v2
417; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
418; VI-NEXT:    s_endpgm
419;
420; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32:
421; GFX11-TRUE16:       ; %bb.0: ; %entry
422; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
423; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
424; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
425; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
426; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
427; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
429; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
430; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
431; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
432; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
433; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
434; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
435; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
436; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
437; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
438; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
439; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
440; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
441; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
442; GFX11-TRUE16-NEXT:    s_endpgm
443;
444; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32:
445; GFX11-FAKE16:       ; %bb.0: ; %entry
446; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
447; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
448; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
449; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
450; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
451; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
453; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
454; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
455; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
456; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
457; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
458; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
459; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
460; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
461; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
462; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
463; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
464; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
465; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
466; GFX11-FAKE16-NEXT:    s_endpgm
467    ptr addrspace(1) %r,
468    ptr addrspace(1) %a) {
469entry:
470  %a.val = load <2 x half>, ptr addrspace(1) %a
471  %r.val = fptosi <2 x half> %a.val to <2 x i32>
472  store <2 x i32> %r.val, ptr addrspace(1) %r
473  ret void
474}
475
476; Need to make sure we promote each f16 element to f32 when converting
477; <2 x half> to <2 x i64>. The checks below cover the code generated for
478; 'i64 = fp_to_sint f32' on each element.
478
; Vector <2 x half> to <2 x i64>. The kernel loads both halves from %a as one
; value, converts them with fptosi, and stores the two i64 results to %r.
; Every run expects each element to go through v_cvt_f32_f16 and then
; v_cvt_i32_f32 to form a low dword, and a v_ashrrev_i32 by 31 of each low
; dword to form the matching high dword, with all four dwords written by one
; 128-bit store. The VI checks expect the high element to be read with an SDWA
; WORD_1 source select, while the other runs expect a right shift by 16.
479define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
480; SI-LABEL: fptosi_v2f16_to_v2i64:
481; SI:       ; %bb.0: ; %entry
482; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
483; SI-NEXT:    s_mov_b32 s7, 0xf000
484; SI-NEXT:    s_mov_b32 s6, -1
485; SI-NEXT:    s_mov_b32 s10, s6
486; SI-NEXT:    s_mov_b32 s11, s7
487; SI-NEXT:    s_waitcnt lgkmcnt(0)
488; SI-NEXT:    s_mov_b32 s8, s2
489; SI-NEXT:    s_mov_b32 s9, s3
490; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
491; SI-NEXT:    s_mov_b32 s4, s0
492; SI-NEXT:    s_mov_b32 s5, s1
493; SI-NEXT:    s_waitcnt vmcnt(0)
494; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
495; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
496; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
497; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
498; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
499; SI-NEXT:    v_cvt_i32_f32_e32 v2, v2
500; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
501; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
502; SI-NEXT:    s_endpgm
503;
504; VI-LABEL: fptosi_v2f16_to_v2i64:
505; VI:       ; %bb.0: ; %entry
506; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
507; VI-NEXT:    s_mov_b32 s7, 0xf000
508; VI-NEXT:    s_mov_b32 s6, -1
509; VI-NEXT:    s_mov_b32 s10, s6
510; VI-NEXT:    s_mov_b32 s11, s7
511; VI-NEXT:    s_waitcnt lgkmcnt(0)
512; VI-NEXT:    s_mov_b32 s8, s2
513; VI-NEXT:    s_mov_b32 s9, s3
514; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
515; VI-NEXT:    s_mov_b32 s4, s0
516; VI-NEXT:    s_mov_b32 s5, s1
517; VI-NEXT:    s_waitcnt vmcnt(0)
518; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
519; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
520; VI-NEXT:    v_cvt_i32_f32_e32 v0, v1
521; VI-NEXT:    v_cvt_i32_f32_e32 v2, v2
522; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
523; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
524; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
525; VI-NEXT:    s_endpgm
526;
527; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64:
528; GFX11-TRUE16:       ; %bb.0: ; %entry
529; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
530; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
531; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
532; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
533; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
534; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
536; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
537; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
538; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
539; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
540; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
541; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
542; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
543; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
544; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
545; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
546; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
547; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v1
548; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
549; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
550; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
551; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
552; GFX11-TRUE16-NEXT:    s_endpgm
553;
554; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64:
555; GFX11-FAKE16:       ; %bb.0: ; %entry
556; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
557; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
558; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
559; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
560; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
561; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
563; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
564; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
565; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
566; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
567; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
568; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
569; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
570; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
571; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
572; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
573; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
574; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v2, v1
575; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
576; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
577; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
578; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
579; GFX11-FAKE16-NEXT:    s_endpgm
580    ptr addrspace(1) %r,
581    ptr addrspace(1) %a) {
582entry:
583  %a.val = load <2 x half>, ptr addrspace(1) %a
584  %r.val = fptosi <2 x half> %a.val to <2 x i64>
585  store <2 x i64> %r.val, ptr addrspace(1) %r
586  ret void
587}
588
; Scalar half to i1. Unlike the other kernels in this file, the half operand
; %in is a kernel argument rather than a value loaded through a pointer, and
; the i1 result is stored to %out.
; No conversion instruction is expected. Every run checks for an equality
; compare of the input against -1.0 followed by v_cndmask_b32 selecting 0 or 1
; and a byte store. The SI checks expect the input to be widened with
; v_cvt_f32_f16 and compared with v_cmp_eq_f32, while the VI and GFX11 runs
; expect v_cmp_eq_f16 directly (the true16 run first copies the scalar into
; v0.l with v_mov_b16).
589define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
590; SI-LABEL: fptosi_f16_to_i1:
591; SI:       ; %bb.0: ; %entry
592; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
593; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
594; SI-NEXT:    s_mov_b32 s3, 0xf000
595; SI-NEXT:    s_mov_b32 s2, -1
596; SI-NEXT:    s_waitcnt lgkmcnt(0)
597; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
598; SI-NEXT:    v_cmp_eq_f32_e32 vcc, -1.0, v0
599; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
600; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
601; SI-NEXT:    s_endpgm
602;
603; VI-LABEL: fptosi_f16_to_i1:
604; VI:       ; %bb.0: ; %entry
605; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
606; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
607; VI-NEXT:    s_mov_b32 s3, 0xf000
608; VI-NEXT:    s_mov_b32 s2, -1
609; VI-NEXT:    s_waitcnt lgkmcnt(0)
610; VI-NEXT:    v_cmp_eq_f16_e64 s[4:5], -1.0, s6
611; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
612; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
613; VI-NEXT:    s_endpgm
614;
615; GFX11-TRUE16-LABEL: fptosi_f16_to_i1:
616; GFX11-TRUE16:       ; %bb.0: ; %entry
617; GFX11-TRUE16-NEXT:    s_clause 0x1
618; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
619; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
620; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
621; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
622; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
623; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
624; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
625; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
626; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
627; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
628; GFX11-TRUE16-NEXT:    s_endpgm
629;
630; GFX11-FAKE16-LABEL: fptosi_f16_to_i1:
631; GFX11-FAKE16:       ; %bb.0: ; %entry
632; GFX11-FAKE16-NEXT:    s_clause 0x1
633; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
634; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
635; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
636; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX11-FAKE16-NEXT:    v_cmp_eq_f16_e64 s2, -1.0, s2
638; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
639; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
640; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
641; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
642; GFX11-FAKE16-NEXT:    s_endpgm
643entry:
644  %conv = fptosi half %in to i1
645  store i1 %conv, ptr addrspace(1) %out
646  ret void
647}
648