18f8016feSHugh Delaney; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s 3b279f6b0SFangrui Song; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} 48f8016feSHugh Delaney 58f8016feSHugh Delaney; Using FTZ should emit fma.ftz.relu for f16, not for bf16 6b279f6b0SFangrui Song; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ 7b279f6b0SFangrui Song; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} 88f8016feSHugh Delaney 98f8016feSHugh Delaney; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu 10b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70 118f8016feSHugh Delaney 128f8016feSHugh Delaneydefine half @fma_f16_expanded_no_nans(half %a, half %b, half %c) { 138f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_no_nans( 148f8016feSHugh Delaney; CHECK: { 158f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 168f8016feSHugh Delaney; CHECK-EMPTY: 178f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 188f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 198f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 208f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 218f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 228f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 238f8016feSHugh Delaney; CHECK-NEXT: ret; 248f8016feSHugh Delaney; 258f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans( 268f8016feSHugh Delaney; CHECK-FTZ: { 278f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 288f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 298f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 308f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 318f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 328f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 338f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 348f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 358f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 368f8016feSHugh Delaney; 378f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_no_nans( 388f8016feSHugh Delaney; CHECK-SM70: { 398f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<2>; 408f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 418f8016feSHugh Delaney; CHECK-SM70-EMPTY: 428f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 438f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 448f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 458f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 468f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 478f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 488f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 498f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 508f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; 518f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 528f8016feSHugh Delaney %1 = fmul fast half %a, %b 538f8016feSHugh Delaney %2 = fadd fast half %1, %c 548f8016feSHugh Delaney %3 = fcmp nsz ogt half %2, 0.0 558f8016feSHugh Delaney %4 = select nsz i1 %3, half %2, half 0.0 568f8016feSHugh Delaney ret half %4 578f8016feSHugh Delaney} 588f8016feSHugh Delaney 598f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 608f8016feSHugh Delaneydefine half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) { 618f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 628f8016feSHugh Delaney; CHECK: { 638f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<10>; 648f8016feSHugh Delaney; CHECK-EMPTY: 658f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 668f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 678f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 688f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 698f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 708f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x0000; 718f8016feSHugh Delaney; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5; 728f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs7, 0x4700; 738f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16 %rs8, %rs4, %rs7; 748f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16 %rs9, %rs6, %rs8; 758f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs9; 768f8016feSHugh Delaney; CHECK-NEXT: ret; 778f8016feSHugh Delaney; 788f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 798f8016feSHugh Delaney; CHECK-FTZ: { 808f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<10>; 818f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 828f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 838f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 848f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 858f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 868f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3; 878f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 888f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16 %rs6, %rs4, %rs5; 898f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs7, 0x4700; 908f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16 %rs8, %rs4, %rs7; 918f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16 %rs9, %rs6, %rs8; 928f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs9; 938f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 948f8016feSHugh Delaney; 958f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 968f8016feSHugh Delaney; CHECK-SM70: { 978f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<2>; 988f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<10>; 998f8016feSHugh Delaney; CHECK-SM70-EMPTY: 1008f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 1018f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 1028f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 1038f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 1048f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 1058f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 1068f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 1078f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 1088f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs7, 0x4700; 1098f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16 %rs8, %rs4, %rs7; 1108f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16 %rs9, %rs6, %rs8; 1118f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs9; 1128f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 1138f8016feSHugh Delaney %1 = fmul fast half %a, %b 1148f8016feSHugh Delaney %2 = fadd fast half %1, %c 1158f8016feSHugh Delaney %3 = fcmp nsz ogt half %2, 0.0 1168f8016feSHugh Delaney %4 = select nsz i1 %3, half %2, half 0.0 1178f8016feSHugh Delaney %5 = fadd half %2, 7.0 1188f8016feSHugh Delaney %6 = fadd half %4, %5 1198f8016feSHugh Delaney ret half %6 1208f8016feSHugh Delaney} 1218f8016feSHugh Delaney 1228f8016feSHugh Delaneydefine half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) { 1238f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_maxnum_no_nans( 1248f8016feSHugh Delaney; CHECK: { 1258f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 1268f8016feSHugh Delaney; CHECK-EMPTY: 1278f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 1288f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 1298f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 1308f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 1318f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 1328f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 1338f8016feSHugh Delaney; CHECK-NEXT: ret; 1348f8016feSHugh Delaney; 1358f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_no_nans( 1368f8016feSHugh Delaney; CHECK-FTZ: { 1378f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 1388f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 1398f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 1408f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 1418f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 1428f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 1438f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 1448f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 1458f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 1468f8016feSHugh Delaney; 1478f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_no_nans( 1488f8016feSHugh Delaney; CHECK-SM70: { 1498f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<6>; 1508f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<3>; 1518f8016feSHugh Delaney; CHECK-SM70-EMPTY: 1528f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 1538f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 1548f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 1558f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 1568f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 1578f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4; 1588f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 1598f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2; 1608f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; 1618f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 1628f8016feSHugh Delaney %1 = fmul fast half %a, %b 1638f8016feSHugh Delaney %2 = fadd fast half %1, %c 1648f8016feSHugh Delaney %3 = call nsz half @llvm.maxnum.f16(half %2, half 0.0) 1658f8016feSHugh Delaney ret half %3 1668f8016feSHugh Delaney} 1678f8016feSHugh Delaney 1688f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) { 1698f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_no_nans( 1708f8016feSHugh Delaney; CHECK: { 1718f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 1728f8016feSHugh Delaney; CHECK-EMPTY: 1738f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 1748f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0]; 1758f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1]; 1768f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2]; 1778f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 1788f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 1798f8016feSHugh Delaney; CHECK-NEXT: ret; 1808f8016feSHugh Delaney; 1818f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans( 1828f8016feSHugh Delaney; CHECK-FTZ: { 1838f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 1848f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 1858f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 1868f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0]; 1878f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1]; 1888f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2]; 1898f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 1908f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 1918f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 1928f8016feSHugh Delaney; 1938f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans( 1948f8016feSHugh Delaney; CHECK-SM70: { 1958f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 196310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<3>; 1978f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<14>; 1988f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<6>; 1998f8016feSHugh Delaney; CHECK-SM70-EMPTY: 2008f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 2018f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2]; 2028f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 2038f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 2048f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1]; 2058f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 2068f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 2078f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0]; 2088f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 2098f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 2108f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 2118f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 2128f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 2138f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 2148f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 2158f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 2168f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 2178f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 2188f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 2198f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 2208f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 2218f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 222310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 223310e7987SAlex MacLean; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; 2248f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 2258f8016feSHugh Delaney %1 = fmul fast bfloat %a, %b 2268f8016feSHugh Delaney %2 = fadd fast bfloat %1, %c 2278f8016feSHugh Delaney %3 = fcmp nsz ogt bfloat %2, 0.0 2288f8016feSHugh Delaney %4 = select nsz i1 %3, bfloat %2, bfloat 0.0 2298f8016feSHugh Delaney ret bfloat %4 2308f8016feSHugh Delaney} 2318f8016feSHugh Delaney 2328f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 2338f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { 2348f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 2358f8016feSHugh Delaney; CHECK: { 236*5e5fd0e6Speterbell10; CHECK-NEXT: .reg .b16 %rs<11>; 2378f8016feSHugh Delaney; CHECK-EMPTY: 2388f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 2398f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 2408f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 2418f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 2428f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 2438f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x0000; 2448f8016feSHugh Delaney; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5; 245*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b16 %rs7, 0x3F80; 246*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b16 %rs8, 0x40E0; 247*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8; 248*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9; 249*5e5fd0e6Speterbell10; CHECK-NEXT: st.param.b16 [func_retval0], %rs10; 2508f8016feSHugh Delaney; CHECK-NEXT: ret; 2518f8016feSHugh Delaney; 2528f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 2538f8016feSHugh Delaney; CHECK-FTZ: { 254310e7987SAlex MacLean; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; 2558f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 2568f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .f32 %f<6>; 2578f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 2588f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 2598f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 2608f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 2618f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 2628f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 2638f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 2648f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5; 2658f8016feSHugh Delaney; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4; 2668f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; 2678f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; 2688f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; 269310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %f2; 2708f8016feSHugh Delaney; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; 2718f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; 2728f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; 273310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs7; 2748f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; 2758f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f4, %r6; 2768f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f5, %f3, %f4; 277310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f5; 278310e7987SAlex MacLean; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; 2798f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 2808f8016feSHugh Delaney; 2818f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 2828f8016feSHugh Delaney; CHECK-SM70: { 2838f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 284310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<4>; 2858f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<29>; 2868f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<10>; 2878f8016feSHugh Delaney; CHECK-SM70-EMPTY: 2888f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 2898f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 2908f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 2918f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 2928f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 2938f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 2948f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 2958f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 2968f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 2978f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 2988f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 2998f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 3008f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 3018f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 3028f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 3038f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 3048f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 3058f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 3068f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 3078f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 3088f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 3098f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 310310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 3118f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f6, %f5, 0f40E00000; 3128f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 3138f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 3148f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 3158f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 3168f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f6, %f6; 3178f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 3188f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3; 319310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2; 3208f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 3218f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 3228f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536; 3238f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f8, %r22; 3248f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f9, %f7, %f8; 3258f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r23, %f9; 3268f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r24, %r23, 16, 1; 3278f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, %r23; 3288f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r26, %r25, 32767; 3298f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f9, %f9; 3308f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304; 3318f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4; 332310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; } 333310e7987SAlex MacLean; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; 3348f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 3358f8016feSHugh Delaney %1 = fmul fast bfloat %a, %b 3368f8016feSHugh Delaney %2 = fadd fast bfloat %1, %c 3378f8016feSHugh Delaney %3 = fcmp nsz ogt bfloat %2, 0.0 3388f8016feSHugh Delaney %4 = select nsz i1 %3, bfloat %2, bfloat 0.0 3398f8016feSHugh Delaney %5 = fadd bfloat %2, 7.0 3408f8016feSHugh Delaney %6 = fadd bfloat %4, %5 3418f8016feSHugh Delaney ret bfloat %6 3428f8016feSHugh Delaney} 3438f8016feSHugh Delaney 3448f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) { 3458f8016feSHugh Delaney; 3468f8016feSHugh Delaney; 3478f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_maxnum_no_nans( 3488f8016feSHugh Delaney; CHECK: { 3498f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 3508f8016feSHugh Delaney; CHECK-EMPTY: 3518f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 3528f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0]; 3538f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1]; 3548f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2]; 3558f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 3568f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 3578f8016feSHugh Delaney; CHECK-NEXT: ret; 3588f8016feSHugh Delaney; 3598f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_no_nans( 3608f8016feSHugh Delaney; CHECK-FTZ: { 3618f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 3628f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 3638f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 3648f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0]; 3658f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1]; 3668f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2]; 3678f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 3688f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 3698f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 3708f8016feSHugh Delaney; 3718f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_no_nans( 3728f8016feSHugh Delaney; CHECK-SM70: { 3738f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 374310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<2>; 3758f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<20>; 3768f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<7>; 3778f8016feSHugh Delaney; CHECK-SM70-EMPTY: 3788f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 3798f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; 3808f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 3818f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 3828f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; 3838f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 3848f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 3858f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; 3868f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 3878f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 3888f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 3898f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 3908f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 3918f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 3928f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 3938f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 3948f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 3958f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 3968f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 3978f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 3988f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000; 3998f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 4008f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 4018f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 4028f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 4038f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 4048f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 4058f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2; 4068f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; } 4078f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1; 4088f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 4098f8016feSHugh Delaney %1 = fmul fast bfloat %a, %b 4108f8016feSHugh Delaney %2 = fadd fast bfloat %1, %c 4118f8016feSHugh Delaney %3 = call nsz bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0) 4128f8016feSHugh Delaney ret bfloat %3 4138f8016feSHugh Delaney} 4148f8016feSHugh Delaney 4158f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) { 4168f8016feSHugh Delaney; 4178f8016feSHugh Delaney; 4188f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_no_nans( 4198f8016feSHugh Delaney; CHECK: { 4208f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 4218f8016feSHugh Delaney; CHECK-EMPTY: 4228f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 4238f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 4248f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 4258f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 4268f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1; 4278f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 4288f8016feSHugh Delaney; CHECK-NEXT: ret; 4298f8016feSHugh Delaney; 4308f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans( 4318f8016feSHugh Delaney; CHECK-FTZ: { 4328f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 4338f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 4348f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 4358f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 4368f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 4378f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 4388f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1; 4398f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 4408f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 4418f8016feSHugh Delaney; 4428f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans( 4438f8016feSHugh Delaney; CHECK-SM70: { 4448f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 4458f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 4468f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<7>; 4478f8016feSHugh Delaney; CHECK-SM70-EMPTY: 4488f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 4498f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 4508f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 4518f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 4528f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 4538f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 0; 4548f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 4558f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 4568f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 4578f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 4588f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 4598f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r6; 4608f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 4618f8016feSHugh Delaney %1 = fmul fast <2 x half> %a, %b 4628f8016feSHugh Delaney %2 = fadd fast <2 x half> %1, %c 4638f8016feSHugh Delaney %3 = fcmp nsz ogt <2 x half> %2, <half 0.0, half 0.0> 4648f8016feSHugh Delaney %4 = select nsz <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 4658f8016feSHugh Delaney ret <2 x half> %4 4668f8016feSHugh Delaney} 4678f8016feSHugh Delaney 4688f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 4698f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) { 4708f8016feSHugh Delaney; 4718f8016feSHugh Delaney; 4728f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 4738f8016feSHugh Delaney; CHECK: { 4748f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<10>; 4758f8016feSHugh Delaney; CHECK-EMPTY: 4768f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 4778f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 4788f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 4798f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 4808f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 4818f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 4828f8016feSHugh Delaney; CHECK-NEXT: max.f16x2 %r6, %r4, %r5; 4838f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r7, 1191200512; 4848f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16x2 %r8, %r4, %r7; 4858f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16x2 %r9, %r6, %r8; 4868f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r9; 4878f8016feSHugh Delaney; CHECK-NEXT: ret; 4888f8016feSHugh Delaney; 4898f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 4908f8016feSHugh Delaney; CHECK-FTZ: { 4918f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<10>; 4928f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 4938f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 4948f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 4958f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 4968f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 4978f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1; 4988f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 4998f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16x2 %r6, %r4, %r5; 5008f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r7, 1191200512; 5018f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16x2 %r8, %r4, %r7; 5028f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16x2 %r9, %r6, %r8; 5038f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r9; 5048f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 5058f8016feSHugh Delaney; 5068f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 5078f8016feSHugh Delaney; CHECK-SM70: { 5088f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 5098f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 5108f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<10>; 5118f8016feSHugh Delaney; CHECK-SM70-EMPTY: 5128f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 5138f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 5148f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 5158f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 5168f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 5178f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 0; 5188f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 5198f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 5208f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 5218f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 5228f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 5238f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, 1191200512; 5248f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16x2 %r8, %r4, %r7; 5258f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16x2 %r9, %r6, %r8; 5268f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r9; 5278f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 5288f8016feSHugh Delaney %1 = fmul fast <2 x half> %a, %b 5298f8016feSHugh Delaney %2 = fadd fast <2 x half> %1, %c 5308f8016feSHugh Delaney %3 = fcmp nsz ogt <2 x half> %2, <half 0.0, half 0.0> 5318f8016feSHugh Delaney %4 = select nsz <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 5328f8016feSHugh Delaney %5 = fadd <2 x half> %2, <half 7.0, half 7.0> 5338f8016feSHugh Delaney %6 = fadd <2 x half> %4, %5 5348f8016feSHugh Delaney ret <2 x half> %6 5358f8016feSHugh Delaney} 5368f8016feSHugh Delaney 5378f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) { 5388f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_maxnum_no_nans( 5398f8016feSHugh Delaney; CHECK: { 5408f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 5418f8016feSHugh Delaney; CHECK-EMPTY: 5428f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 5438f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2]; 5448f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1]; 5458f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0]; 5468f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1; 5478f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 5488f8016feSHugh Delaney; CHECK-NEXT: ret; 5498f8016feSHugh Delaney; 5508f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_maxnum_no_nans( 5518f8016feSHugh Delaney; CHECK-FTZ: { 5528f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 5538f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 5548f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 5558f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2]; 5568f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1]; 5578f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0]; 5588f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1; 5598f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 5608f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 5618f8016feSHugh Delaney; 5628f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_maxnum_no_nans( 5638f8016feSHugh Delaney; CHECK-SM70: { 5648f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 5658f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<6>; 5668f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<5>; 5678f8016feSHugh Delaney; CHECK-SM70-EMPTY: 5688f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 5698f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2]; 5708f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1]; 5718f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0]; 5728f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 5738f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 5748f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs2; 5758f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 5768f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs3, %f2; 5778f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f3, %rs1; 5788f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f4, %f3, 0f00000000; 5798f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs4, %f4; 5808f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, {%rs4, %rs3}; 5818f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r5; 5828f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 5838f8016feSHugh Delaney %1 = fmul fast <2 x half> %a, %b 5848f8016feSHugh Delaney %2 = fadd fast <2 x half> %1, %c 5858f8016feSHugh Delaney %3 = call nsz <2 x half> @llvm.maxnum.f16x2(<2 x half> %2, <2 x half> <half 0.0, half 0.0>) 5868f8016feSHugh Delaney ret <2 x half> %3 5878f8016feSHugh Delaney} 5888f8016feSHugh Delaney 5898f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 5908f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_expanded_no_nans( 5918f8016feSHugh Delaney; CHECK: { 5928f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 5938f8016feSHugh Delaney; CHECK-EMPTY: 5948f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 5958f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2]; 5968f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1]; 5978f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0]; 5988f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 5998f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 6008f8016feSHugh Delaney; CHECK-NEXT: ret; 6018f8016feSHugh Delaney; 6028f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans( 6038f8016feSHugh Delaney; CHECK-FTZ: { 6048f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 6058f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 6068f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 6078f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2]; 6088f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1]; 6098f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0]; 6108f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 6118f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 6128f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 6138f8016feSHugh Delaney; 6148f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans( 6158f8016feSHugh Delaney; CHECK-SM70: { 6168f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 617310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<11>; 6188f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<31>; 6198f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<11>; 6208f8016feSHugh Delaney; CHECK-SM70-EMPTY: 6218f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 6228f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0]; 6238f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1]; 6248f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2]; 6258f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 6268f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; 6278f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 6288f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 629310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 630310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; 6318f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 6328f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 633310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 634310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; 6358f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 6368f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 6378f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 6388f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 6398f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 6408f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 6418f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 6428f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 6438f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 6448f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 645310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } 6468f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; 6478f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 6488f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 649310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; 6508f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 6518f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 652310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; 6538f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 6548f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 6558f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 6568f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 6578f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 6588f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 6598f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 6608f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 6618f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 6628f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 663310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } 6648f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; 6658f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 6668f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; 6678f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; 6688f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f10, %r29; 6698f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; 670310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; 671310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; 672310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; 6738f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; 6748f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 6758f8016feSHugh Delaney %1 = fmul fast <2 x bfloat> %a, %b 6768f8016feSHugh Delaney %2 = fadd fast <2 x bfloat> %1, %c 6778f8016feSHugh Delaney %3 = fcmp nsz ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0> 6788f8016feSHugh Delaney %4 = select nsz <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0> 6798f8016feSHugh Delaney ret <2 x bfloat> %4 6808f8016feSHugh Delaney} 6818f8016feSHugh Delaney 6828f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 6838f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 6848f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( 6858f8016feSHugh Delaney; CHECK: { 686*5e5fd0e6Speterbell10; CHECK-NEXT: .reg .b32 %r<11>; 6878f8016feSHugh Delaney; CHECK-EMPTY: 6888f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 6898f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 6908f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 6918f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 6928f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 6938f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 6948f8016feSHugh Delaney; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5; 695*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b32 %r7, 1065369472; 696*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b32 %r8, 1088438496; 697*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8; 698*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9; 699*5e5fd0e6Speterbell10; CHECK-NEXT: st.param.b32 [func_retval0], %r10; 7008f8016feSHugh Delaney; CHECK-NEXT: ret; 7018f8016feSHugh Delaney; 7028f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( 7038f8016feSHugh Delaney; CHECK-FTZ: { 704310e7987SAlex MacLean; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 7058f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<20>; 7068f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .f32 %f<11>; 7078f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 7088f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 7098f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 7108f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 7118f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 7128f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 7138f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 7148f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5; 7158f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4; 7164b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs2; 7178f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; 7188f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f1, %r8; 7198f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; 720310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; 7214b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1; 7228f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; 7238f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f3, %r10; 7248f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f4, %f3, 0f40E00000; 725310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; 726310e7987SAlex MacLean; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6; 727310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs5; 7288f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; 7298f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f5, %r12; 730310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4; 7318f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16; 7328f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f6, %r14; 7338f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f7, %f5, %f6; 734310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs6; 7358f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16; 7368f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f8, %r16; 737310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3; 7388f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r18, %r17, 16; 7398f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f9, %r18; 7408f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f10, %f8, %f9; 7414b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7; 7428f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r19; 7438f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 7448f8016feSHugh Delaney; 7458f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( 7468f8016feSHugh Delaney; CHECK-SM70: { 7478f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<9>; 748310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<11>; 749310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b32 %r<61>; 7508f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<19>; 7518f8016feSHugh Delaney; CHECK-SM70-EMPTY: 7528f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 7538f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 7548f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 7558f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 7568f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 7578f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; 7588f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 7598f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 760310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 761310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; 7628f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 7638f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 764310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 765310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; 7668f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 7678f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 7688f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 7698f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 7708f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 7718f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 7728f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 7738f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 7748f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 7758f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 776310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } 7778f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; 7788f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 7798f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 780310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; 7818f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 7828f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 783310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; 7848f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 7858f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 7868f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 7878f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 7888f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 7898f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 7908f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 7918f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 7928f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 7938f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 794310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } 7958f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; 7968f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 7978f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; 7988f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; 7998f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f10, %r29; 8008f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; 801310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; 802310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; 8038f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f11, %f10, 0f40E00000; 8048f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r30, %f11; 8058f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1; 8068f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r30; 8078f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767; 8088f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f11, %f11; 8098f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r34, %r30, 4194304; 8108f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p5; 8118f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f12, %f9, 0f40E00000; 8128f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r36, %f12; 8138f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1; 8148f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36; 8158f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767; 8168f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f12, %f12; 8178f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; 8188f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6; 819310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10; 8208f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16; 8218f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f13, %r43; 8228f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536; 8238f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f14, %r44; 8248f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f15, %f13, %f14; 8258f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r45, %f15; 8268f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r46, %r45, 16, 1; 8278f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r47, %r46, %r45; 8288f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r48, %r47, 32767; 8298f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15; 8308f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304; 8318f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7; 832310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs9; 8338f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16; 8348f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f16, %r52; 8358f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r53, %r35, -65536; 8368f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f17, %r53; 8378f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f18, %f16, %f17; 8388f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r54, %f18; 8398f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r55, %r54, 16, 1; 8408f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r56, %r55, %r54; 8418f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r57, %r56, 32767; 8428f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p8, %f18, %f18; 8438f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304; 8448f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8; 845a1f5fe8cSFraser Cormack; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U; 8468f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60; 8478f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 8488f8016feSHugh Delaney %1 = fmul fast <2 x bfloat> %a, %b 8498f8016feSHugh Delaney %2 = fadd fast <2 x bfloat> %1, %c 8508f8016feSHugh Delaney %3 = fcmp nsz ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0> 8518f8016feSHugh Delaney %4 = select nsz <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0> 8528f8016feSHugh Delaney %5 = fadd <2 x bfloat> %2, <bfloat 7.0, bfloat 7.0> 8538f8016feSHugh Delaney %6 = fadd <2 x bfloat> %4, %5 8548f8016feSHugh Delaney ret <2 x bfloat> %6 8558f8016feSHugh Delaney} 8568f8016feSHugh Delaney 8578f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 8588f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_expanded_maxnum_no_nans( 8598f8016feSHugh Delaney; CHECK: { 8608f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 8618f8016feSHugh Delaney; CHECK-EMPTY: 8628f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 8638f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2]; 8648f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1]; 8658f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0]; 8668f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 8678f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 8688f8016feSHugh Delaney; CHECK-NEXT: ret; 8698f8016feSHugh Delaney; 8708f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_expanded_maxnum_no_nans( 8718f8016feSHugh Delaney; CHECK-FTZ: { 8728f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 8738f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 8748f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 8758f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2]; 8768f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1]; 8778f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0]; 8788f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 8798f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 8808f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 8818f8016feSHugh Delaney; 8828f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans( 8838f8016feSHugh Delaney; CHECK-SM70: { 8848f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 885310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 886310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b32 %r<43>; 8878f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<13>; 8888f8016feSHugh Delaney; CHECK-SM70-EMPTY: 8898f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 8908f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0]; 8918f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1]; 8928f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2]; 8938f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 8948f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; 8958f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 8968f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 897310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 898310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; 8998f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 9008f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 901310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 902310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; 9038f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 9048f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 9058f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 9068f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 9078f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 9088f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 9098f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 9108f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 9118f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 9128f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 9138f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; 9148f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 9158f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 916310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; 9178f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 9188f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 919310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; 9208f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 9218f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 9228f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 9238f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 9248f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 9258f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 9268f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 9278f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 9288f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 9298f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 9308f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536; 9318f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 9328f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f10, %f9, 0f00000000; 9338f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r29, %f10; 9348f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1; 9358f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29; 9368f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767; 9378f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10; 9388f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304; 9398f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3; 9408f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536; 9418f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f11, %r35; 9428f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000; 9438f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r36, %f12; 9448f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1; 9458f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36; 9468f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767; 9478f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12; 9488f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; 9498f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4; 950a1f5fe8cSFraser Cormack; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U; 9518f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42; 9528f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 9538f8016feSHugh Delaney %1 = fmul fast <2 x bfloat> %a, %b 9548f8016feSHugh Delaney %2 = fadd fast <2 x bfloat> %1, %c 9558f8016feSHugh Delaney %3 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>) 9568f8016feSHugh Delaney ret <2 x bfloat> %3 9578f8016feSHugh Delaney} 9588f8016feSHugh Delaney 9598f8016feSHugh Delaneydefine half @fma_f16_no_nans(half %a, half %b, half %c) { 9608f8016feSHugh Delaney; CHECK-LABEL: fma_f16_no_nans( 9618f8016feSHugh Delaney; CHECK: { 9628f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 9638f8016feSHugh Delaney; CHECK-EMPTY: 9648f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 9658f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_no_nans_param_0]; 9668f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_no_nans_param_1]; 9678f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_no_nans_param_2]; 9688f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 9698f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 9708f8016feSHugh Delaney; CHECK-NEXT: ret; 9718f8016feSHugh Delaney; 9728f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_no_nans( 9738f8016feSHugh Delaney; CHECK-FTZ: { 9748f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 9758f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 9768f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 9778f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_no_nans_param_0]; 9788f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_no_nans_param_1]; 9798f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_no_nans_param_2]; 9808f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 9818f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 9828f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 9838f8016feSHugh Delaney; 9848f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_no_nans( 9858f8016feSHugh Delaney; CHECK-SM70: { 9868f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<2>; 9878f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 9888f8016feSHugh Delaney; CHECK-SM70-EMPTY: 9898f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 9908f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_no_nans_param_0]; 9918f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_no_nans_param_1]; 9928f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_no_nans_param_2]; 9938f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 9948f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 9958f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 9968f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 9978f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; 9988f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 9998f8016feSHugh Delaney %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c) 10008f8016feSHugh Delaney %2 = fcmp nsz ogt half %1, 0.0 10018f8016feSHugh Delaney %3 = select nsz i1 %2, half %1, half 0.0 10028f8016feSHugh Delaney ret half %3 10038f8016feSHugh Delaney} 10048f8016feSHugh Delaney 10058f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 10068f8016feSHugh Delaneydefine half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) { 10078f8016feSHugh Delaney; CHECK-LABEL: fma_f16_no_nans_multiple_uses_of_fma( 10088f8016feSHugh Delaney; CHECK: { 10098f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<8>; 10108f8016feSHugh Delaney; CHECK-EMPTY: 10118f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 10128f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_no_nans_multiple_uses_of_fma_param_0]; 10138f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_no_nans_multiple_uses_of_fma_param_1]; 10148f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_no_nans_multiple_uses_of_fma_param_2]; 10158f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 10168f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x4700; 10178f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16 %rs6, %rs4, %rs5; 10188f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16 %rs7, %rs6, %rs4; 10198f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs7; 10208f8016feSHugh Delaney; CHECK-NEXT: ret; 10218f8016feSHugh Delaney; 10228f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_no_nans_multiple_uses_of_fma( 10238f8016feSHugh Delaney; CHECK-FTZ: { 10248f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<8>; 10258f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 10268f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 10278f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_no_nans_multiple_uses_of_fma_param_0]; 10288f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_no_nans_multiple_uses_of_fma_param_1]; 10298f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_no_nans_multiple_uses_of_fma_param_2]; 10308f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3; 10318f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x4700; 10328f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16 %rs6, %rs4, %rs5; 10338f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16 %rs7, %rs6, %rs4; 10348f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs7; 10358f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 10368f8016feSHugh Delaney; 10378f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_no_nans_multiple_uses_of_fma( 10388f8016feSHugh Delaney; CHECK-SM70: { 10398f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<8>; 10408f8016feSHugh Delaney; CHECK-SM70-EMPTY: 10418f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 10428f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_no_nans_multiple_uses_of_fma_param_0]; 10438f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_no_nans_multiple_uses_of_fma_param_1]; 10448f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_no_nans_multiple_uses_of_fma_param_2]; 10458f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 10468f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x4700; 10478f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16 %rs6, %rs4, %rs5; 10488f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16 %rs7, %rs6, %rs4; 10498f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs7; 10508f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 10518f8016feSHugh Delaney %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c) 10528f8016feSHugh Delaney %2 = fcmp nsz ogt half %1, 0.0 10538f8016feSHugh Delaney %3 = select nsz i1 %2, half %1, half 0.0 10548f8016feSHugh Delaney %4 = fadd half %1, 7.0 10558f8016feSHugh Delaney %5 = fadd half %4, %1 10568f8016feSHugh Delaney ret half %5 10578f8016feSHugh Delaney} 10588f8016feSHugh Delaney 10598f8016feSHugh Delaneydefine half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) { 10608f8016feSHugh Delaney; CHECK-LABEL: fma_f16_maxnum_no_nans( 10618f8016feSHugh Delaney; CHECK: { 10628f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 10638f8016feSHugh Delaney; CHECK-EMPTY: 10648f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 10658f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_maxnum_no_nans_param_0]; 10668f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_maxnum_no_nans_param_1]; 10678f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_maxnum_no_nans_param_2]; 10688f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 10698f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 10708f8016feSHugh Delaney; CHECK-NEXT: ret; 10718f8016feSHugh Delaney; 10728f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_maxnum_no_nans( 10738f8016feSHugh Delaney; CHECK-FTZ: { 10748f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 10758f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 10768f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 10778f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_maxnum_no_nans_param_0]; 10788f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_maxnum_no_nans_param_1]; 10798f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_maxnum_no_nans_param_2]; 10808f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 10818f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 10828f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 10838f8016feSHugh Delaney; 10848f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_maxnum_no_nans( 10858f8016feSHugh Delaney; CHECK-SM70: { 10868f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<6>; 10878f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<3>; 10888f8016feSHugh Delaney; CHECK-SM70-EMPTY: 10898f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 10908f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_maxnum_no_nans_param_0]; 10918f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_maxnum_no_nans_param_1]; 10928f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_maxnum_no_nans_param_2]; 10938f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 10948f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4; 10958f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 10968f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2; 10978f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; 10988f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 10998f8016feSHugh Delaney %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c) 11008f8016feSHugh Delaney %2 = call nsz half @llvm.maxnum.f16(half %1, half 0.0) 11018f8016feSHugh Delaney ret half %2 11028f8016feSHugh Delaney} 11038f8016feSHugh Delaney 11048f8016feSHugh Delaneydefine bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { 11058f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_no_nans( 11068f8016feSHugh Delaney; CHECK: { 11078f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 11088f8016feSHugh Delaney; CHECK-EMPTY: 11098f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 11108f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_param_0]; 11118f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_param_1]; 11128f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_param_2]; 11138f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 11148f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 11158f8016feSHugh Delaney; CHECK-NEXT: ret; 11168f8016feSHugh Delaney; 11178f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_no_nans( 11188f8016feSHugh Delaney; CHECK-FTZ: { 11198f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 11208f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 11218f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 11228f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_param_0]; 11238f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_param_1]; 11248f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_param_2]; 11258f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 11268f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 11278f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 11288f8016feSHugh Delaney; 11298f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_no_nans( 11308f8016feSHugh Delaney; CHECK-SM70: { 11318f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 1132310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<3>; 11338f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<14>; 11348f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<6>; 11358f8016feSHugh Delaney; CHECK-SM70-EMPTY: 11368f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 11378f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_param_2]; 11388f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 11398f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 11408f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_param_1]; 11418f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 11428f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 11438f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_param_0]; 11448f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 11458f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 11468f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 11478f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 11488f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 11498f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 11508f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 11518f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 11528f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 11538f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 11548f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 11558f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 11568f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 11578f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 1158310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 1159310e7987SAlex MacLean; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; 11608f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 11618f8016feSHugh Delaney %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) 11628f8016feSHugh Delaney %2 = fcmp nsz ogt bfloat %1, 0.0 11638f8016feSHugh Delaney %3 = select nsz i1 %2, bfloat %1, bfloat 0.0 11648f8016feSHugh Delaney ret bfloat %3 11658f8016feSHugh Delaney} 11668f8016feSHugh Delaney 11678f8016feSHugh Delaney; FMA_relu shouldn't be selected if the FMA operation has multiple uses 11688f8016feSHugh Delaneydefine bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { 11698f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( 11708f8016feSHugh Delaney; CHECK: { 1171*5e5fd0e6Speterbell10; CHECK-NEXT: .reg .b16 %rs<9>; 11728f8016feSHugh Delaney; CHECK-EMPTY: 11738f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 11748f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; 11758f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; 11768f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; 11778f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 1178*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b16 %rs5, 0x3F80; 1179*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b16 %rs6, 0x40E0; 1180*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16 %rs7, %rs4, %rs5, %rs6; 1181*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16 %rs8, %rs7, %rs5, %rs4; 1182*5e5fd0e6Speterbell10; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; 11838f8016feSHugh Delaney; CHECK-NEXT: ret; 11848f8016feSHugh Delaney; 11858f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( 11868f8016feSHugh Delaney; CHECK-FTZ: { 1187310e7987SAlex MacLean; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 11888f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 11898f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .f32 %f<5>; 11908f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 11918f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 11928f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; 11938f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; 11948f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; 11958f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 11968f8016feSHugh Delaney; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4; 11978f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; 11988f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; 11998f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; 1200310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f2; 1201310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs5; 12028f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; 12038f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; 12048f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f4, %f3, %f1; 1205310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f4; 1206310e7987SAlex MacLean; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6; 12078f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 12088f8016feSHugh Delaney; 12098f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( 12108f8016feSHugh Delaney; CHECK-SM70: { 12118f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<4>; 1212310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<2>; 12138f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<27>; 12148f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<9>; 12158f8016feSHugh Delaney; CHECK-SM70-EMPTY: 12168f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 12178f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; 12188f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 12198f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 12208f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; 12218f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 12228f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 12238f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; 12248f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 12258f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 12268f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 12278f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 12288f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 12298f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 12308f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 12318f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 12328f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 12338f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 12348f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 12358f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 12368f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f6, %f5, 0f40E00000; 12378f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 12388f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 12398f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 12408f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 12418f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 12428f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 12438f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2; 12448f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r20, %r19, -65536; 12458f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r20; 12468f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f8, %f7, %f5; 12478f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r21, %f8; 12488f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r22, %r21, 16, 1; 12498f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r23, %r22, %r21; 12508f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, 32767; 12518f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f8, %f8; 12528f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r25, %r21, 4194304; 12538f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r26, %r25, %r24, %p3; 12548f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; } 12558f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1; 12568f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 12578f8016feSHugh Delaney %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) 12588f8016feSHugh Delaney %2 = fcmp nsz ogt bfloat %1, 0.0 12598f8016feSHugh Delaney %3 = select nsz i1 %2, bfloat %1, bfloat 0.0 12608f8016feSHugh Delaney %4 = fadd bfloat %1, 7.0 12618f8016feSHugh Delaney %5 = fadd bfloat %4, %1 12628f8016feSHugh Delaney ret bfloat %5 12638f8016feSHugh Delaney} 12648f8016feSHugh Delaney 12658f8016feSHugh Delaneydefine bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) { 12668f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_maxnum_no_nans( 12678f8016feSHugh Delaney; CHECK: { 12688f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 12698f8016feSHugh Delaney; CHECK-EMPTY: 12708f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 12718f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_maxnum_no_nans_param_0]; 12728f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_maxnum_no_nans_param_1]; 12738f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_maxnum_no_nans_param_2]; 12748f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 12758f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 12768f8016feSHugh Delaney; CHECK-NEXT: ret; 12778f8016feSHugh Delaney; 12788f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_maxnum_no_nans( 12798f8016feSHugh Delaney; CHECK-FTZ: { 12808f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 12818f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 12828f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 12838f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_maxnum_no_nans_param_0]; 12848f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_maxnum_no_nans_param_1]; 12858f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_maxnum_no_nans_param_2]; 12868f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 12878f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 12888f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 12898f8016feSHugh Delaney; 12908f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_maxnum_no_nans( 12918f8016feSHugh Delaney; CHECK-SM70: { 12928f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 1293310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<2>; 12948f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<20>; 12958f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<7>; 12968f8016feSHugh Delaney; CHECK-SM70-EMPTY: 12978f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 12988f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2]; 12998f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 13008f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 13018f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_maxnum_no_nans_param_1]; 13028f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 13038f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 13048f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0]; 13058f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 13068f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 13078f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 13088f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 13098f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 13108f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 13118f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 13128f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 13138f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 13148f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 13158f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 13168f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 13178f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000; 13188f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 13198f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 13208f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 13218f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 13228f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 13238f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 13248f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2; 13258f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; } 13268f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1; 13278f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 13288f8016feSHugh Delaney %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) 13298f8016feSHugh Delaney %2 = call nsz bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0) 13308f8016feSHugh Delaney ret bfloat %2 13318f8016feSHugh Delaney} 13328f8016feSHugh Delaney 13338f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) { 13348f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_no_nans( 13358f8016feSHugh Delaney; CHECK: { 13368f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<7>; 13378f8016feSHugh Delaney; CHECK-EMPTY: 13388f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 13398f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_no_nans_param_2]; 13408f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_no_nans_param_1]; 13418f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_no_nans_param_0]; 13428f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 13438f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 13448f8016feSHugh Delaney; CHECK-NEXT: max.f16x2 %r6, %r4, %r5; 13458f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r6; 13468f8016feSHugh Delaney; CHECK-NEXT: ret; 13478f8016feSHugh Delaney; 13488f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_no_nans( 13498f8016feSHugh Delaney; CHECK-FTZ: { 13508f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 13518f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 13528f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 13538f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_no_nans_param_2]; 13548f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_no_nans_param_1]; 13558f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_no_nans_param_0]; 13568f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1; 13578f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 13588f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16x2 %r6, %r4, %r5; 13598f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r6; 13608f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 13618f8016feSHugh Delaney; 13628f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_no_nans( 13638f8016feSHugh Delaney; CHECK-SM70: { 13648f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 13658f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 13668f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<7>; 13678f8016feSHugh Delaney; CHECK-SM70-EMPTY: 13688f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 13698f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_no_nans_param_2]; 13708f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_no_nans_param_1]; 13718f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_no_nans_param_0]; 13728f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 13738f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 0; 13748f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 13758f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 13768f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 13778f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 13788f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 13798f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r6; 13808f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 13818f8016feSHugh Delaney %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) 13828f8016feSHugh Delaney %2 = fcmp nsz ogt <2 x half> %1, <half 0.0, half 0.0> 13838f8016feSHugh Delaney %3 = select nsz <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0> 13848f8016feSHugh Delaney ret <2 x half> %3 13858f8016feSHugh Delaney} 13868f8016feSHugh Delaney 13878f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 13888f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) { 13898f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_no_nans_multiple_uses_of_fma( 13908f8016feSHugh Delaney; CHECK: { 13918f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<8>; 13928f8016feSHugh Delaney; CHECK-EMPTY: 13938f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 13948f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_no_nans_multiple_uses_of_fma_param_2]; 13958f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_no_nans_multiple_uses_of_fma_param_1]; 13968f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_no_nans_multiple_uses_of_fma_param_0]; 13978f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 13988f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 1191200512; 13998f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16x2 %r6, %r4, %r5; 14008f8016feSHugh Delaney; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r4; 14018f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r7; 14028f8016feSHugh Delaney; CHECK-NEXT: ret; 14038f8016feSHugh Delaney; 14048f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_no_nans_multiple_uses_of_fma( 14058f8016feSHugh Delaney; CHECK-FTZ: { 14068f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<8>; 14078f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 14088f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 14098f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_no_nans_multiple_uses_of_fma_param_2]; 14108f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_no_nans_multiple_uses_of_fma_param_1]; 14118f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_no_nans_multiple_uses_of_fma_param_0]; 14128f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1; 14138f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 1191200512; 14148f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16x2 %r6, %r4, %r5; 14158f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f16x2 %r7, %r6, %r4; 14168f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r7; 14178f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 14188f8016feSHugh Delaney; 14198f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_no_nans_multiple_uses_of_fma( 14208f8016feSHugh Delaney; CHECK-SM70: { 14218f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<8>; 14228f8016feSHugh Delaney; CHECK-SM70-EMPTY: 14238f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 14248f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_no_nans_multiple_uses_of_fma_param_2]; 14258f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_no_nans_multiple_uses_of_fma_param_1]; 14268f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_no_nans_multiple_uses_of_fma_param_0]; 14278f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 14288f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 1191200512; 14298f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16x2 %r6, %r4, %r5; 14308f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f16x2 %r7, %r6, %r4; 14318f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r7; 14328f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 14338f8016feSHugh Delaney %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) 14348f8016feSHugh Delaney %2 = fcmp nsz ogt <2 x half> %1, <half 0.0, half 0.0> 14358f8016feSHugh Delaney %3 = select nsz <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0> 14368f8016feSHugh Delaney %4 = fadd <2 x half> %1, <half 7.0, half 7.0> 14378f8016feSHugh Delaney %5 = fadd <2 x half> %4, %1 14388f8016feSHugh Delaney ret <2 x half> %5 14398f8016feSHugh Delaney} 14408f8016feSHugh Delaney 14418f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) { 14428f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_maxnum_no_nans( 14438f8016feSHugh Delaney; CHECK: { 14448f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 14458f8016feSHugh Delaney; CHECK-EMPTY: 14468f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 14478f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_maxnum_no_nans_param_2]; 14488f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_maxnum_no_nans_param_1]; 14498f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_maxnum_no_nans_param_0]; 14508f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1; 14518f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 14528f8016feSHugh Delaney; CHECK-NEXT: ret; 14538f8016feSHugh Delaney; 14548f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_maxnum_no_nans( 14558f8016feSHugh Delaney; CHECK-FTZ: { 14568f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 14578f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 14588f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 14598f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_maxnum_no_nans_param_2]; 14608f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_maxnum_no_nans_param_1]; 14618f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_maxnum_no_nans_param_0]; 14628f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1; 14638f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 14648f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 14658f8016feSHugh Delaney; 14668f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_maxnum_no_nans( 14678f8016feSHugh Delaney; CHECK-SM70: { 14688f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 14698f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<6>; 14708f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<5>; 14718f8016feSHugh Delaney; CHECK-SM70-EMPTY: 14728f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 14738f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_maxnum_no_nans_param_2]; 14748f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_maxnum_no_nans_param_1]; 14758f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_maxnum_no_nans_param_0]; 14768f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 14778f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 14788f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs2; 14798f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 14808f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs3, %f2; 14818f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f3, %rs1; 14828f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f4, %f3, 0f00000000; 14838f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs4, %f4; 14848f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, {%rs4, %rs3}; 14858f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r5; 14868f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 14878f8016feSHugh Delaney %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) 14888f8016feSHugh Delaney %2 = call nsz <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>) 14898f8016feSHugh Delaney ret <2 x half> %2 14908f8016feSHugh Delaney} 14918f8016feSHugh Delaney 14928f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 14938f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_no_nans( 14948f8016feSHugh Delaney; CHECK: { 14958f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 14968f8016feSHugh Delaney; CHECK-EMPTY: 14978f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 14988f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_param_2]; 14998f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_param_1]; 15008f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_param_0]; 15018f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 15028f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 15038f8016feSHugh Delaney; CHECK-NEXT: ret; 15048f8016feSHugh Delaney; 15058f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_no_nans( 15068f8016feSHugh Delaney; CHECK-FTZ: { 15078f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 15088f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 15098f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 15108f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_param_2]; 15118f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_param_1]; 15128f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_param_0]; 15138f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 15148f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 15158f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 15168f8016feSHugh Delaney; 15178f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_no_nans( 15188f8016feSHugh Delaney; CHECK-SM70: { 15198f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 1520310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<11>; 15218f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<31>; 15228f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<11>; 15238f8016feSHugh Delaney; CHECK-SM70-EMPTY: 15248f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 15258f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_param_0]; 15268f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_param_1]; 15278f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_param_2]; 15288f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 15298f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; 15308f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 15318f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 1532310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 1533310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; 15348f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 15358f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 1536310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 1537310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; 15388f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 15398f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 15408f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 15418f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 15428f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 15438f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 15448f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 15458f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 15468f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 15478f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 1548310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } 15498f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; 15508f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 15518f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 1552310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; 15538f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 15548f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 1555310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; 15568f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 15578f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 15588f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 15598f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 15608f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 15618f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 15628f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 15638f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 15648f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 15658f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 1566310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } 15678f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; 15688f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 15698f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; 15708f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; 15718f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f10, %r29; 15728f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; 1573310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; 1574310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; 1575310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; 15768f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; 15778f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 15788f8016feSHugh Delaney %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) 15798f8016feSHugh Delaney %2 = fcmp nsz ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0> 15808f8016feSHugh Delaney %3 = select nsz <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0> 15818f8016feSHugh Delaney ret <2 x bfloat> %3 15828f8016feSHugh Delaney} 15838f8016feSHugh Delaney 15848f8016feSHugh Delaney; FMA_relu shouldn't be selected if the FMA operation has multiple uses 15858f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 15868f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( 15878f8016feSHugh Delaney; CHECK: { 1588*5e5fd0e6Speterbell10; CHECK-NEXT: .reg .b32 %r<9>; 15898f8016feSHugh Delaney; CHECK-EMPTY: 15908f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 15918f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2]; 15928f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1]; 15938f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0]; 15948f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 1595*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b32 %r5, 1065369472; 1596*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b32 %r6, 1088438496; 1597*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16x2 %r7, %r4, %r5, %r6; 1598*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16x2 %r8, %r7, %r5, %r4; 1599*5e5fd0e6Speterbell10; CHECK-NEXT: st.param.b32 [func_retval0], %r8; 16008f8016feSHugh Delaney; CHECK-NEXT: ret; 16018f8016feSHugh Delaney; 16028f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( 16038f8016feSHugh Delaney; CHECK-FTZ: { 1604310e7987SAlex MacLean; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 16058f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<14>; 16068f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .f32 %f<9>; 16078f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 16088f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 16098f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2]; 16108f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1]; 16118f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0]; 16128f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 16138f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4; 16144b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs2; 16158f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; 16168f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f1, %r6; 16178f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f40E00000; 1618310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; 16194b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs1; 16208f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; 16218f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f3, %r8; 16228f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f4, %f3, 0f40E00000; 1623310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; 1624310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs4; 16258f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; 16268f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f5, %r10; 16278f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f3; 1628310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs3; 16298f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; 16308f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f7, %r12; 16318f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.rn.ftz.f32 %f8, %f7, %f1; 16324b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r13, %f8, %f6; 16338f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r13; 16348f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 16358f8016feSHugh Delaney; 16368f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( 16378f8016feSHugh Delaney; CHECK-SM70: { 16388f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<7>; 1639310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 1640310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b32 %r<57>; 16418f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<17>; 16428f8016feSHugh Delaney; CHECK-SM70-EMPTY: 16438f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 16448f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0]; 16458f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1]; 16468f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2]; 16478f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 16488f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; 16498f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 16508f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 1651310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 1652310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; 16538f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 16548f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 1655310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 1656310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; 16578f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 16588f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 16598f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 16608f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 16618f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 16628f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 16638f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 16648f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 16658f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 16668f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 16678f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; 16688f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 16698f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 1670310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; 16718f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 16728f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 1673310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; 16748f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 16758f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 16768f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 16778f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 16788f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 16798f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 16808f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 16818f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 16828f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 16838f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 16848f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536; 16858f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 16868f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f10, %f9, 0f40E00000; 16878f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r29, %f10; 16888f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1; 16898f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29; 16908f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767; 16918f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10; 16928f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304; 16938f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3; 16948f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536; 16958f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f11, %r35; 16968f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f12, %f11, 0f40E00000; 16978f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r36, %f12; 16988f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1; 16998f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36; 17008f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767; 17018f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12; 17028f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; 17038f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4; 17048f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r42, %r41, -65536; 17058f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f13, %r42; 17068f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f14, %f13, %f11; 17078f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r43, %f14; 17088f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r44, %r43, 16, 1; 17098f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r45, %r44, %r43; 17108f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r46, %r45, 32767; 17118f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f14, %f14; 17128f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r47, %r43, 4194304; 17138f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r48, %r47, %r46, %p5; 17148f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r49, %r34, -65536; 17158f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f15, %r49; 17168f8016feSHugh Delaney; CHECK-SM70-NEXT: add.rn.f32 %f16, %f15, %f9; 17178f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r50, %f16; 17188f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r51, %r50, 16, 1; 17198f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r52, %r51, %r50; 17208f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r53, %r52, 32767; 17218f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f16, %f16; 17228f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r54, %r50, 4194304; 17238f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r55, %r54, %r53, %p6; 1724a1f5fe8cSFraser Cormack; CHECK-SM70-NEXT: prmt.b32 %r56, %r55, %r48, 0x7632U; 17258f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r56; 17268f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 17278f8016feSHugh Delaney %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) 17288f8016feSHugh Delaney %2 = fcmp nsz ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0> 17298f8016feSHugh Delaney %3 = select nsz <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0> 17308f8016feSHugh Delaney %4 = fadd <2 x bfloat> %1, <bfloat 7.0, bfloat 7.0> 17318f8016feSHugh Delaney %5 = fadd <2 x bfloat> %4, %1 17328f8016feSHugh Delaney ret <2 x bfloat> %5 17338f8016feSHugh Delaney} 17348f8016feSHugh Delaney 17358f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 17368f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_maxnum_no_nans( 17378f8016feSHugh Delaney; CHECK: { 17388f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 17398f8016feSHugh Delaney; CHECK-EMPTY: 17408f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 17418f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_2]; 17428f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_maxnum_no_nans_param_1]; 17438f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_0]; 17448f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 17458f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 17468f8016feSHugh Delaney; CHECK-NEXT: ret; 17478f8016feSHugh Delaney; 17488f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_maxnum_no_nans( 17498f8016feSHugh Delaney; CHECK-FTZ: { 17508f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 17518f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 17528f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 17538f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_2]; 17548f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_maxnum_no_nans_param_1]; 17558f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_0]; 17568f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 17578f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 17588f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 17598f8016feSHugh Delaney; 17608f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans( 17618f8016feSHugh Delaney; CHECK-SM70: { 17628f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 1763310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 1764310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b32 %r<43>; 17658f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<13>; 17668f8016feSHugh Delaney; CHECK-SM70-EMPTY: 17678f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 17688f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_0]; 17698f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_maxnum_no_nans_param_1]; 17708f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_2]; 17718f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 17728f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; 17738f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 17748f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 1775310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 1776310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; 17778f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 17788f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 1779310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 1780310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; 17818f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 17828f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 17838f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 17848f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 17858f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 17868f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 17878f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 17888f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 17898f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 17908f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 17918f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; 17928f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 17938f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 1794310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; 17958f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 17968f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 1797310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; 17988f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 17998f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 18008f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 18018f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 18028f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 18038f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 18048f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 18058f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 18068f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 18078f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 18088f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536; 18098f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 18108f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f10, %f9, 0f00000000; 18118f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r29, %f10; 18128f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1; 18138f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29; 18148f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767; 18158f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10; 18168f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304; 18178f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3; 18188f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536; 18198f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f11, %r35; 18208f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000; 18218f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r36, %f12; 18228f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1; 18238f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36; 18248f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767; 18258f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12; 18268f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; 18278f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4; 1828a1f5fe8cSFraser Cormack; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U; 18298f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42; 18308f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 18318f8016feSHugh Delaney %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) 18328f8016feSHugh Delaney %2 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>) 18338f8016feSHugh Delaney ret <2 x bfloat> %2 18348f8016feSHugh Delaney} 1835