; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s

; TODO: Add global-isel when it can support bf16

; fpext of a bfloat argument to float. Both targets share the same checks:
; a single 16-bit left shift of v0.
define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}

; fpext of an inreg bfloat argument to float. The checks expect the shift to
; be done on s0 and the result then copied into v0 for the return.
define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshl_b32 s0, s0, 16
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}

; fptrunc of a <2 x float> argument to <2 x bfloat>, returned bitcast to float.
; The gfx940 checks show a per-lane bfe/add3/or/cmp/cndmask sequence followed
; by v_perm_b32; the gfx950 checks show a single v_cvt_pk_bf16_f32.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Same conversion as v_test_cvt_v2f32_v2bf16_v, but the source vector is inreg.
; The gfx940 checks keep most of the expansion in scalar instructions and pack
; with s_pack_ll_b32_b16; the gfx950 checks copy s1 to a VGPR and then use
; v_cvt_pk_bf16_f32.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    s_bfe_u32 s2, s1, 0x10010
; GFX-940-NEXT:    s_add_i32 s2, s2, s1
; GFX-940-NEXT:    s_or_b32 s4, s1, 0x400000
; GFX-940-NEXT:    s_add_i32 s5, s2, 0x7fff
; GFX-940-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
; GFX-940-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GFX-940-NEXT:    s_cselect_b32 s1, s4, s5
; GFX-940-NEXT:    s_lshr_b32 s2, s1, 16
; GFX-940-NEXT:    s_bfe_u32 s1, s0, 0x10010
; GFX-940-NEXT:    s_add_i32 s1, s1, s0
; GFX-940-NEXT:    s_or_b32 s3, s0, 0x400000
; GFX-940-NEXT:    s_add_i32 s4, s1, 0x7fff
; GFX-940-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
; GFX-940-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GFX-940-NEXT:    s_cselect_b32 s0, s3, s4
; GFX-940-NEXT:    s_lshr_b32 s0, s0, 16
; GFX-940-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX-940-NEXT:    v_mov_b32_e32 v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_mov_b32_e32 v0, s1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, s0, v0
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Round trip: fptrunc float to bfloat, then fpext back to float.
; The gfx940 checks end by masking off the low 16 bits; the gfx950 checks
; convert with v_cvt_pk_bf16_f32 and shift the result left by 16.
define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
; GFX-940-LABEL: v_test_cvt_f32_bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX-940-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_f32_bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX-950-NEXT:    ; return to shader part epilog
  %trunc = fptrunc float %src to bfloat
  %ext = fpext bfloat %trunc to float
  ret float %ext
}

; fptrunc of a <2 x double> argument to <2 x bfloat>, returned bitcast to float.
; The gfx940 checks convert |x| to f32, compare it against the original value
; and adjust by +/-1 before the bf16 rounding sequence; the gfx950 checks use
; two v_cvt_f32_f64 followed by v_cvt_pk_bf16_f32.
; NOTE(review): the gfx950 sequence appears to round twice (f64->f32, then
; f32->bf16) without the adjustment step seen for gfx940 - confirm intended.
define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-940-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-940-NEXT:    s_brev_b32 s4, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v1, s4, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s5, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s5
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
; GFX-940-NEXT:    v_and_b32_e32 v6, 1, v5
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
; GFX-940-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v0, v5, v0
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX-940-NEXT:    v_and_or_b32 v1, v3, s4, v0
; GFX-940-NEXT:    v_bfe_u32 v0, v0, 16, 1
; GFX-940-NEXT:    v_add3_u32 v0, v0, v1, s5
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v0, v4, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v2
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x double> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Two scalar float->bfloat truncations assembled into a <2 x bfloat> with
; insertelement, returned bitcast to float. The gfx950 checks expect both
; truncations to be emitted as one v_cvt_pk_bf16_f32.
; The insertelement chain starts from poison rather than undef; both lanes are
; overwritten, so the base value is never observed.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT:    ; return to shader part epilog
entry:
  %a.cvt = fptrunc float %a to bfloat
  %b.cvt = fptrunc float %b to bfloat
  %v2.1 = insertelement <2 x bfloat> poison, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}

; Like fptrunc_f32_f32_to_v2bf16, but %a is negated and %b goes through fabs
; before truncation. The gfx950 checks expect both to appear as source
; modifiers (-v0, |v1|) on v_cvt_pk_bf16_f32.
; The insertelement chain starts from poison rather than undef; both lanes are
; overwritten, so the base value is never observed.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX-940-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1|
; GFX-950-NEXT:    ; return to shader part epilog
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  %b.abs = call float @llvm.fabs.f32(float %b)
  %b.cvt = fptrunc float %b.abs to bfloat
  %v2.1 = insertelement <2 x bfloat> poison, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}

; fptrunc float to bfloat with the result stored through %out.
; The gfx940 checks store the high half of the rounded value with
; flat_store_short_d16_hi; the gfx950 checks convert with v_cvt_pk_bf16_f32
; and use flat_store_short.
define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v4, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc float %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fabs followed by fptrunc float to bfloat, stored through %out.
; The gfx950 checks expect the fabs to appear as the |v0| source modifier on
; v_cvt_pk_bf16_f32; the gfx940 checks clear the sign bit with an explicit AND.
define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16_abs:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16_abs:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.abs = call float @llvm.fabs.f32(float %a)
  %a.cvt = fptrunc float %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fneg followed by fptrunc float to bfloat, stored through %out.
; The gfx950 checks expect the negation to appear as the -v0 source modifier
; on v_cvt_pk_bf16_f32; the gfx940 checks flip the sign bit with an explicit
; XOR.
define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16_neg:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16_neg:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fptrunc double to bfloat, stored through %out.
; The gfx940 checks convert |x| to f32, compare it against the original value
; and adjust by +/-1 before the bf16 rounding sequence; the gfx950 checks use
; v_cvt_f32_f64 followed by v_cvt_pk_bf16_f32.
; NOTE(review): the gfx950 sequence appears to round twice (f64->f32, then
; f32->bf16) without the adjustment step seen for gfx940 - confirm intended.
define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-940-NEXT:    s_brev_b32 s0, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v1, s0, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc double %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fneg followed by fptrunc double to bfloat, stored through %out.
; The gfx950 checks expect the negation as a source modifier on v_cvt_f32_f64
; (-v[0:1]), followed by v_cvt_pk_bf16_f32.
; NOTE(review): the gfx950 sequence appears to round twice (f64->f32, then
; f32->bf16) without the adjustment step seen for gfx940 - confirm intended.
define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16_neg:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    s_brev_b32 s4, 1
; GFX-940-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX-940-NEXT:    v_and_or_b32 v5, v6, s4, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, -v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.neg = fneg double %a
  %a.cvt = fptrunc double %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fabs followed by fptrunc double to bfloat, stored through %out.
; The gfx950 checks expect the fabs as a source modifier on v_cvt_f32_f64
; (|v[0:1]|), followed by v_cvt_pk_bf16_f32.
; NOTE(review): the gfx950 sequence appears to round twice (f64->f32, then
; f32->bf16) without the adjustment step seen for gfx940 - confirm intended.
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX-940-NEXT:    s_brev_b32 s0, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v6, s0, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, |v[0:1]|
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.abs = call double @llvm.fabs.f64(double %a)
  %a.cvt = fptrunc double %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fabs intrinsics called by the *_abs and *_mods test functions in this file.
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)