; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s

; TODO: Add global-isel when it can support bf16

; Codegen tests for bfloat conversions (fpext from bfloat, fptrunc from
; float/double to bfloat) on gfx940 and gfx950. The gfx940 assertions show an
; expanded integer rounding sequence; the gfx950 assertions show
; v_cvt_pk_bf16_f32 being selected instead.

; fpext bfloat -> float, argument passed in a VGPR.
define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}

; fpext bfloat -> float, argument marked inreg (arrives in an SGPR).
define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshl_b32 s0, s0, 16
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}

; fptrunc <2 x float> -> <2 x bfloat>, vector argument in VGPRs.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; fptrunc <2 x float> -> <2 x bfloat>, vector argument marked inreg (SGPRs).
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    s_bfe_u32 s2, s1, 0x10010
; GFX-940-NEXT:    s_add_i32 s2, s2, s1
; GFX-940-NEXT:    s_or_b32 s4, s1, 0x400000
; GFX-940-NEXT:    s_add_i32 s5, s2, 0x7fff
; GFX-940-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
; GFX-940-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GFX-940-NEXT:    s_cselect_b32 s1, s4, s5
; GFX-940-NEXT:    s_lshr_b32 s2, s1, 16
; GFX-940-NEXT:    s_bfe_u32 s1, s0, 0x10010
; GFX-940-NEXT:    s_add_i32 s1, s1, s0
; GFX-940-NEXT:    s_or_b32 s3, s0, 0x400000
; GFX-940-NEXT:    s_add_i32 s4, s1, 0x7fff
; GFX-940-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
; GFX-940-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GFX-940-NEXT:    s_cselect_b32 s0, s3, s4
; GFX-940-NEXT:    s_lshr_b32 s0, s0, 16
; GFX-940-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX-940-NEXT:    v_mov_b32_e32 v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_mov_b32_e32 v0, s1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, s0, v0
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Round trip: fptrunc float -> bfloat followed by fpext bfloat -> float.
define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
; GFX-940-LABEL: v_test_cvt_f32_bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX-940-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_f32_bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX-950-NEXT:    ; return to shader part epilog
  %trunc = fptrunc float %src to bfloat
  %ext = fpext bfloat %trunc to float
  ret float %ext
}

; fptrunc <2 x double> -> <2 x bfloat>.
define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-940-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-940-NEXT:    s_brev_b32 s4, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v1, s4, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s5, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s5
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
; GFX-940-NEXT:    v_and_b32_e32 v6, 1, v5
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
; GFX-940-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v0, v5, v0
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX-940-NEXT:    v_and_or_b32 v1, v3, s4, v0
; GFX-940-NEXT:    v_bfe_u32 v0, v0, 16, 1
; GFX-940-NEXT:    v_add3_u32 v0, v0, v1, s5
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v0, v4, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v2
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x double> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Two scalar float -> bfloat truncations assembled into a <2 x bfloat>.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT:    ; return to shader part epilog
entry:
  %a.cvt = fptrunc float %a to bfloat
  %b.cvt = fptrunc float %b to bfloat
  %v2.1 = insertelement <2 x bfloat> poison, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}

; Same as above, but with fneg on the first source and fabs on the second.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX-940-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1|
; GFX-950-NEXT:    ; return to shader part epilog
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  %b.abs = call float @llvm.fabs.f32(float %b)
  %b.cvt = fptrunc float %b.abs to bfloat
  %v2.1 = insertelement <2 x bfloat> poison, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}

; Scalar fptrunc float -> bfloat, result stored through a flat pointer.
define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v4, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc float %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Scalar fptrunc float -> bfloat with an fabs source modifier.
define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16_abs:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16_abs:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.abs = call float @llvm.fabs.f32(float %a)
  %a.cvt = fptrunc float %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Scalar fptrunc float -> bfloat with an fneg source modifier.
define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16_neg:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16_neg:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Scalar fptrunc double -> bfloat, result stored through a flat pointer.
define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-940-NEXT:    s_brev_b32 s0, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v1, s0, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc double %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Scalar fptrunc double -> bfloat with an fneg source modifier.
define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16_neg:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    s_brev_b32 s4, 1
; GFX-940-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX-940-NEXT:    v_and_or_b32 v5, v6, s4, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, -v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.neg = fneg double %a
  %a.cvt = fptrunc double %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Scalar fptrunc double -> bfloat with an fabs source modifier.
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX-940-NEXT:    s_brev_b32 s0, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v6, s0, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, |v[0:1]|
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.abs = call double @llvm.fabs.f64(double %a)
  %a.cvt = fptrunc double %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)