; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s

; Codegen tests for conversions between bfloat and float/double on gfx940 and
; gfx950. The gfx950 checks use v_cvt_pk_bf16_f32, while the gfx940 checks
; contain an expanded integer/compare sequence for every fptrunc.

; TODO: Add global-isel when it can support bf16

; fpext of a plain bfloat argument to float.
define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}

; fpext of an inreg bfloat argument to float.
define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshl_b32 s0, s0, 16
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}

; Vector fptrunc <2 x float> to <2 x bfloat>, result returned bitcast to float.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Same vector fptrunc as above, but with an inreg <2 x float> argument.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    s_bfe_u32 s2, s1, 0x10010
; GFX-940-NEXT:    s_add_i32 s2, s2, s1
; GFX-940-NEXT:    s_or_b32 s4, s1, 0x400000
; GFX-940-NEXT:    s_add_i32 s5, s2, 0x7fff
; GFX-940-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
; GFX-940-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GFX-940-NEXT:    s_cselect_b32 s1, s4, s5
; GFX-940-NEXT:    s_lshr_b32 s2, s1, 16
; GFX-940-NEXT:    s_bfe_u32 s1, s0, 0x10010
; GFX-940-NEXT:    s_add_i32 s1, s1, s0
; GFX-940-NEXT:    s_or_b32 s3, s0, 0x400000
; GFX-940-NEXT:    s_add_i32 s4, s1, 0x7fff
; GFX-940-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
; GFX-940-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GFX-940-NEXT:    s_cselect_b32 s0, s3, s4
; GFX-940-NEXT:    s_lshr_b32 s0, s0, 16
; GFX-940-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX-940-NEXT:    v_mov_b32_e32 v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_mov_b32_e32 v0, s1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, s0, v0
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Scalar round trip: fptrunc float to bfloat, then fpext back to float.
define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
; GFX-940-LABEL: v_test_cvt_f32_bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX-940-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_f32_bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX-950-NEXT:    ; return to shader part epilog
  %trunc = fptrunc float %src to bfloat
  %ext = fpext bfloat %trunc to float
  ret float %ext
}

; Vector fptrunc <2 x double> to <2 x bfloat>, result returned bitcast to float.
; NOTE(review): the gfx950 checks convert each double to f32 first and then
; pack to bf16 - confirm that the two-step rounding is intended.
define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-940-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-940:       ; %bb.0:
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-940-NEXT:    s_brev_b32 s4, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v1, s4, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s5, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s5
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
; GFX-940-NEXT:    v_and_b32_e32 v6, 1, v5
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
; GFX-940-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v0, v5, v0
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX-940-NEXT:    v_and_or_b32 v1, v3, s4, v0
; GFX-940-NEXT:    v_bfe_u32 v0, v0, 16, 1
; GFX-940-NEXT:    v_add3_u32 v0, v0, v1, s5
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v0, v4, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GFX-950:       ; %bb.0:
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v2
; GFX-950-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x double> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}

; Two independent scalar fptruncs assembled into a <2 x bfloat> with
; insertelement. The vector is seeded with poison; both lanes are overwritten.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
; GFX-950-NEXT:    ; return to shader part epilog
entry:
  %a.cvt = fptrunc float %a to bfloat
  %b.cvt = fptrunc float %b to bfloat
  %v2.1 = insertelement <2 x bfloat> poison, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}

; As above, but lane 0 is fneg(%a) and lane 1 is fabs(%b) before truncation.
; The gfx950 checks show both modifiers on the packed convert's operands.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX-940-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX-940-NEXT:    s_nop 0
; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
; GFX-940-NEXT:    ; return to shader part epilog
;
; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1|
; GFX-950-NEXT:    ; return to shader part epilog
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  %b.abs = call float @llvm.fabs.f32(float %b)
  %b.cvt = fptrunc float %b.abs to bfloat
  %v2.1 = insertelement <2 x bfloat> poison, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}

; Scalar fptrunc float to bfloat, stored through %out.
define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
; GFX-940-NEXT:    v_or_b32_e32 v4, 0x400000, v0
; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc float %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fptrunc of fabs(float) to bfloat, stored through %out.
define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16_abs:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16_abs:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.abs = call float @llvm.fabs.f32(float %a)
  %a.cvt = fptrunc float %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fptrunc of fneg(float) to bfloat, stored through %out.
define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f32_to_bf16_neg:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
; GFX-940-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f32_to_bf16_neg:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Scalar fptrunc double to bfloat, stored through %out.
; NOTE(review): the gfx950 checks go through an intermediate f32 conversion
; before the bf16 convert - confirm that the two-step rounding is intended.
define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-940-NEXT:    s_brev_b32 s0, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v1, s0, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc double %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fptrunc of fneg(double) to bfloat, stored through %out.
define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16_neg:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    s_brev_b32 s4, 1
; GFX-940-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX-940-NEXT:    v_and_or_b32 v5, v6, s4, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, -v[0:1]
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.neg = fneg double %a
  %a.cvt = fptrunc double %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; fptrunc of fabs(double) to bfloat, stored through %out.
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-940-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-940:       ; %bb.0: ; %entry
; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GFX-940-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GFX-940-NEXT:    s_brev_b32 s0, 1
; GFX-940-NEXT:    v_and_or_b32 v5, v6, s0, v4
; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
; GFX-940-NEXT:    s_nop 1
; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GFX-940-NEXT:    s_endpgm
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-950:       ; %bb.0: ; %entry
; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, |v[0:1]|
; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT:    flat_store_short v[2:3], v0
; GFX-950-NEXT:    s_endpgm
entry:
  %a.abs = call double @llvm.fabs.f64(double %a)
  %a.cvt = fptrunc double %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}

; Intrinsics used by the abs variants above.
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)