; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
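;
; Codegen tests for the llvm.fptrunc.round intrinsic with explicit rounding
; modes. For readability: the hwreg(HW_REG_MODE, offset, size) operands in
; the checks address the FP_ROUND field of the MODE register, where (per the
; GFX10 ISA documentation) bits [1:0] hold the f32 rounding mode and bits
; [3:2] the f64/f16 mode, with the encodings 0 = nearest-even, 1 = +inf,
; 2 = -inf, 3 = toward zero.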

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_tonearest(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearest")
  ret half %res
}
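; round.tonearest matches the hardware default rounding mode, so no
; s_setreg is emitted for it.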

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  ret half %res
}

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_downward(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
  ret half %res
}

define amdgpu_gs half @v_fptrunc_round_f32_to_f16_towardzero(float %a) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
  ret half %res
}

define amdgpu_gs void @v_fptrunc_round_f32_to_f16_upward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v0, v0, v4
; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
; CHECK-NEXT:    global_store_short v[2:3], v0, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}
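; Conversions that share a rounding mode reuse a single MODE write; the
; switch from upward to downward rewrites the whole 2-bit field, and the
; default (nearest-even) is restored before the adds and the store.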

define amdgpu_gs void @v_fptrunc_round_f32_to_f16_downward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v0
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v0, v4, v0
; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
; CHECK-NEXT:    global_store_short v[2:3], v0, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs void @v_fptrunc_round_f32_to_f16_towardzero_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v0, v0, v4
; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
; CHECK-NEXT:    global_store_short v[2:3], v0, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.towardzero")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %bitcast = bitcast half %res to i16
  %ret = zext i16 %bitcast to i32
  ret i32 %ret
}
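; For inreg (SGPR) sources, the input is copied to a VGPR for the VALU
; conversion and the scalar result is recovered with v_readfirstlane_b32.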

define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_f32_to_f16_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    ; return to shader part epilog
  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
  %bitcast = bitcast half %res to i16
  %ret = zext i16 %bitcast to i32
  ret i32 %ret
}

define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float inreg %a, float inreg %b, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v3
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; CHECK-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; CHECK-NEXT:    v_add_f16_e32 v2, v2, v4
; CHECK-NEXT:    v_add_f16_e32 v2, v3, v2
; CHECK-NEXT:    global_store_short v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
  %res4 = fadd half %res1, %res2
  %res5 = fadd half %res3, %res4
  store half %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  ret <2 x half> %res
}
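; For vector results, SelectionDAG packs the converted halves with
; v_perm_b32 while GlobalISel uses v_pack_b32_f16, hence the split
; SDAG/GISEL check lines.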

define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
  ret <2 x half> %res
}

define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> %a, <2 x float> %b, ptr addrspace(1) %out) {
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v3
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v3
; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; SDAG-NEXT:    v_pk_add_f16 v0, v0, v3
; SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
; SDAG-NEXT:    global_store_dword v[4:5], v0, off
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v3
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v3
; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT:    v_pack_b32_f16 v1, v1, v2
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
; GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
; GISEL-NEXT:    global_store_dword v[4:5], v0, off
; GISEL-NEXT:    s_endpgm
  %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
  %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
  %res4 = fadd <2 x half> %res1, %res2
  %res5 = fadd <2 x half> %res3, %res4
  store <2 x half> %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  %bitcast = bitcast <2 x half> %res to <2 x i16>
  %ret = zext <2 x i16> %bitcast to <2 x i32>
  ret <2 x i32> %ret
}

define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    v_mov_b32_e32 v1, s1
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
; CHECK-NEXT:    ; return to shader part epilog
  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
  %bitcast = bitcast <2 x half> %res to <2 x i16>
  %ret = zext <2 x i16> %bitcast to <2 x i32>
  ret <2 x i32> %ret
}

define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    v_mov_b32_e32 v2, s0
; SDAG-NEXT:    v_mov_b32_e32 v3, s2
; SDAG-NEXT:    v_mov_b32_e32 v4, s1
; SDAG-NEXT:    v_mov_b32_e32 v5, s3
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v3
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v5
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SDAG-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; SDAG-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; SDAG-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v5
; SDAG-NEXT:    v_lshl_or_b32 v5, v7, 16, v6
; SDAG-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; SDAG-NEXT:    v_pk_add_f16 v2, v2, v5
; SDAG-NEXT:    v_pk_add_f16 v2, v3, v2
; SDAG-NEXT:    global_store_dword v[0:1], v2, off
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    v_mov_b32_e32 v3, s1
; GISEL-NEXT:    v_mov_b32_e32 v4, s2
; GISEL-NEXT:    v_mov_b32_e32 v5, s3
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v5
; GISEL-NEXT:    v_pack_b32_f16 v2, v2, v3
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v5
; GISEL-NEXT:    v_pack_b32_f16 v5, v6, v7
; GISEL-NEXT:    v_pack_b32_f16 v3, v3, v4
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
; GISEL-NEXT:    v_pk_add_f16 v2, v2, v5
; GISEL-NEXT:    v_pk_add_f16 v2, v3, v2
; GISEL-NEXT:    global_store_dword v[0:1], v2, off
; GISEL-NEXT:    s_endpgm
  %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
  %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
  %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
  %res4 = fadd <2 x half> %res1, %res2
  %res5 = fadd <2 x half> %res3, %res4
  store <2 x half> %res5, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
  ret <3 x half> %res
}

define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
  ret <3 x half> %res
}

define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
  ret <4 x half> %res
}

define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
  ret <4 x half> %res
}

define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    v_pack_b32_f16 v2, v4, v5
; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
  ret <8 x half> %res
}

define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %a) {
; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v6
; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SDAG-NEXT:    v_cvt_f16_f32_e32 v5, v5
; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v7
; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT:    v_pack_b32_f16 v2, v4, v5
; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT:    ; return to shader part epilog
  %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
  ret <8 x half> %res
}

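; The f64-to-f32 tests below toggle the same FP_ROUND bits ([3:2]) as the
; f16 conversions above, as seen in the s_setreg operands.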
define amdgpu_gs float @v_fptrunc_round_f64_to_f32_tonearest(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_tonearest:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.tonearest")
  ret float %res
}

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_upward(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_upward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.upward")
  ret float %res
}

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_downward(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_downward:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward")
  ret float %res
}

define amdgpu_gs float @v_fptrunc_round_f64_to_f32_towardzero(double %a) {
; CHECK-LABEL: v_fptrunc_round_f64_to_f32_towardzero:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
; CHECK-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; CHECK-NEXT:    ; return to shader part epilog
  %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.towardzero")
  ret float %res
}