18f8016feSHugh Delaney; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s 3b279f6b0SFangrui Song; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} 48f8016feSHugh Delaney 58f8016feSHugh Delaney; Using FTZ should emit fma.ftz.relu for f16, not for bf16 6b279f6b0SFangrui Song; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ 7b279f6b0SFangrui Song; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} 88f8016feSHugh Delaney 98f8016feSHugh Delaney; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu 10b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70 118f8016feSHugh Delaney 128f8016feSHugh Delaneydefine half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 { 138f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_no_nans( 148f8016feSHugh Delaney; CHECK: { 158f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 168f8016feSHugh Delaney; CHECK-EMPTY: 178f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 188f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 198f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 208f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 218f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 228f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 238f8016feSHugh Delaney; CHECK-NEXT: ret; 248f8016feSHugh Delaney; 258f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans( 268f8016feSHugh Delaney; CHECK-FTZ: { 278f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 288f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 
298f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 308f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 318f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 328f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 338f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 348f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 358f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 368f8016feSHugh Delaney; 378f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_no_nans( 388f8016feSHugh Delaney; CHECK-SM70: { 398f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<2>; 408f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 418f8016feSHugh Delaney; CHECK-SM70-EMPTY: 428f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 438f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 448f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 458f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 468f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 478f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 488f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 498f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 508f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; 518f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 528f8016feSHugh Delaney %1 = fmul half %a, %b 538f8016feSHugh Delaney %2 = fadd half %1, %c 548f8016feSHugh Delaney %3 = fcmp ogt half %2, 0.0 558f8016feSHugh Delaney %4 = select i1 %3, half %2, half 0.0 568f8016feSHugh Delaney ret half %4 578f8016feSHugh Delaney} 588f8016feSHugh Delaney 598f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 608f8016feSHugh 
Delaneydefine half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0 { 618f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 628f8016feSHugh Delaney; CHECK: { 638f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<10>; 648f8016feSHugh Delaney; CHECK-EMPTY: 658f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 668f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 678f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 688f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 698f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 708f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x0000; 718f8016feSHugh Delaney; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5; 728f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs7, 0x4700; 738f8016feSHugh Delaney; CHECK-NEXT: add.f16 %rs8, %rs4, %rs7; 748f8016feSHugh Delaney; CHECK-NEXT: add.f16 %rs9, %rs6, %rs8; 758f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs9; 768f8016feSHugh Delaney; CHECK-NEXT: ret; 778f8016feSHugh Delaney; 788f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 798f8016feSHugh Delaney; CHECK-FTZ: { 808f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<10>; 818f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 828f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 838f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 848f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 858f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 868f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3; 878f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 
888f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16 %rs6, %rs4, %rs5; 898f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs7, 0x4700; 908f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f16 %rs8, %rs4, %rs7; 918f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f16 %rs9, %rs6, %rs8; 928f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs9; 938f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 948f8016feSHugh Delaney; 958f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 968f8016feSHugh Delaney; CHECK-SM70: { 978f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<2>; 988f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<10>; 998f8016feSHugh Delaney; CHECK-SM70-EMPTY: 1008f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 1018f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 1028f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 1038f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 1048f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 1058f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 1068f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 1078f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 1088f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs7, 0x4700; 1098f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f16 %rs8, %rs4, %rs7; 1108f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f16 %rs9, %rs6, %rs8; 1118f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs9; 1128f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 1138f8016feSHugh Delaney %1 = fmul half %a, %b 1148f8016feSHugh Delaney %2 = fadd half %1, %c 1158f8016feSHugh Delaney %3 = fcmp ogt half %2, 0.0 1168f8016feSHugh Delaney %4 = select i1 %3, half %2, half 0.0 1178f8016feSHugh Delaney %5 = fadd half %2, 
7.0 1188f8016feSHugh Delaney %6 = fadd half %4, %5 1198f8016feSHugh Delaney ret half %6 1208f8016feSHugh Delaney} 1218f8016feSHugh Delaney 1228f8016feSHugh Delaneydefine half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 { 1238f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_unsafe_with_nans( 1248f8016feSHugh Delaney; CHECK: { 1258f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<7>; 1268f8016feSHugh Delaney; CHECK-EMPTY: 1278f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 1288f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0]; 1298f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1]; 1308f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2]; 1318f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 1328f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x0000; 1338f8016feSHugh Delaney; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5; 1348f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; 1358f8016feSHugh Delaney; CHECK-NEXT: ret; 1368f8016feSHugh Delaney; 1378f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe_with_nans( 1388f8016feSHugh Delaney; CHECK-FTZ: { 1398f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 1408f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 1418f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 1428f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0]; 1438f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1]; 1448f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2]; 1458f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3; 1468f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 1478f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16 %rs6, %rs4, %rs5; 1488f8016feSHugh Delaney; CHECK-FTZ-NEXT: 
st.param.b16 [func_retval0], %rs6; 1498f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 1508f8016feSHugh Delaney; 1518f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_unsafe_with_nans( 1528f8016feSHugh Delaney; CHECK-SM70: { 1538f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<2>; 1548f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 1558f8016feSHugh Delaney; CHECK-SM70-EMPTY: 1568f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 1578f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0]; 1588f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1]; 1598f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2]; 1608f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 1618f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 1628f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 1638f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 1648f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; 1658f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 1668f8016feSHugh Delaney %1 = fmul half %a, %b 1678f8016feSHugh Delaney %2 = fadd half %1, %c 1688f8016feSHugh Delaney %3 = fcmp ogt half %2, 0.0 1698f8016feSHugh Delaney %4 = select i1 %3, half %2, half 0.0 1708f8016feSHugh Delaney ret half %4 1718f8016feSHugh Delaney} 1728f8016feSHugh Delaney 1738f8016feSHugh Delaneydefine half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 { 1748f8016feSHugh Delaney; CHECK-LABEL: fma_f16_expanded_maxnum_no_nans( 1758f8016feSHugh Delaney; CHECK: { 1768f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 1778f8016feSHugh Delaney; CHECK-EMPTY: 1788f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 1798f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 1808f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, 
[fma_f16_expanded_maxnum_no_nans_param_1]; 1818f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 1828f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 1838f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 1848f8016feSHugh Delaney; CHECK-NEXT: ret; 1858f8016feSHugh Delaney; 1868f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_no_nans( 1878f8016feSHugh Delaney; CHECK-FTZ: { 1888f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 1898f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 1908f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 1918f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 1928f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 1938f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 1948f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 1958f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 1968f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 1978f8016feSHugh Delaney; 1988f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_no_nans( 1998f8016feSHugh Delaney; CHECK-SM70: { 2008f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<6>; 2018f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<3>; 2028f8016feSHugh Delaney; CHECK-SM70-EMPTY: 2038f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 2048f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 2058f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 2068f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 2078f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 2088f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4; 2098f8016feSHugh 
Delaney; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 2108f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2; 2118f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; 2128f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 2138f8016feSHugh Delaney %1 = fmul half %a, %b 2148f8016feSHugh Delaney %2 = fadd half %1, %c 2158f8016feSHugh Delaney %3 = call half @llvm.maxnum.f16(half %2, half 0.0) 2168f8016feSHugh Delaney ret half %3 2178f8016feSHugh Delaney} 2188f8016feSHugh Delaney 2198f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) #1 { 2208f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_unsafe_with_nans( 2218f8016feSHugh Delaney; CHECK: { 2228f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<7>; 2238f8016feSHugh Delaney; CHECK-EMPTY: 2248f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 2258f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_with_nans_param_0]; 2268f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_with_nans_param_1]; 2278f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_with_nans_param_2]; 2288f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 2298f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x0000; 2308f8016feSHugh Delaney; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5; 2318f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; 2328f8016feSHugh Delaney; CHECK-NEXT: ret; 2338f8016feSHugh Delaney; 2348f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe_with_nans( 2358f8016feSHugh Delaney; CHECK-FTZ: { 2368f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 2378f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 2388f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 2398f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_with_nans_param_0]; 2408f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, 
[fma_bf16_expanded_unsafe_with_nans_param_1]; 2418f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_with_nans_param_2]; 2428f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 2438f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 2448f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5; 2458f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6; 2468f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 2478f8016feSHugh Delaney; 2488f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe_with_nans( 2498f8016feSHugh Delaney; CHECK-SM70: { 2508f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 251310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<3>; 2528f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<14>; 2538f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<6>; 2548f8016feSHugh Delaney; CHECK-SM70-EMPTY: 2558f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 2568f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2]; 2578f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 2588f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 2598f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1]; 2608f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 2618f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 2628f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0]; 2638f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 2648f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 2658f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 2668f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 2678f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 2688f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 2698f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, 
%r9, 32767; 2708f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 2718f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 2728f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 2738f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 2748f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 2758f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 2768f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 277310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 278310e7987SAlex MacLean; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; 2798f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 2808f8016feSHugh Delaney %1 = fmul bfloat %a, %b 2818f8016feSHugh Delaney %2 = fadd bfloat %1, %c 2828f8016feSHugh Delaney %3 = fcmp ogt bfloat %2, 0.0 2838f8016feSHugh Delaney %4 = select i1 %3, bfloat %2, bfloat 0.0 2848f8016feSHugh Delaney ret bfloat %4 2858f8016feSHugh Delaney} 2868f8016feSHugh Delaney 2878f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { 2888f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_no_nans( 2898f8016feSHugh Delaney; CHECK: { 2908f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 2918f8016feSHugh Delaney; CHECK-EMPTY: 2928f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 2938f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0]; 2948f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1]; 2958f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2]; 2968f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 2978f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 2988f8016feSHugh Delaney; CHECK-NEXT: ret; 2998f8016feSHugh Delaney; 3008f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans( 3018f8016feSHugh Delaney; CHECK-FTZ: { 
3028f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 3038f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 3048f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 3058f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0]; 3068f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1]; 3078f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2]; 3088f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 3098f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 3108f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 3118f8016feSHugh Delaney; 3128f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans( 3138f8016feSHugh Delaney; CHECK-SM70: { 3148f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 315310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<3>; 3168f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<14>; 3178f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<6>; 3188f8016feSHugh Delaney; CHECK-SM70-EMPTY: 3198f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 3208f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2]; 3218f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 3228f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 3238f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1]; 3248f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 3258f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 3268f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0]; 3278f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 3288f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 3298f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 3308f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 3318f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, 
%r7, 16, 1; 3328f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 3338f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 3348f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 3358f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 3368f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 3378f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 3388f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 3398f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 3408f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 341310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 342310e7987SAlex MacLean; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; 3438f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 3448f8016feSHugh Delaney %1 = fmul bfloat %a, %b 3458f8016feSHugh Delaney %2 = fadd bfloat %1, %c 3468f8016feSHugh Delaney %3 = fcmp ogt bfloat %2, 0.0 3478f8016feSHugh Delaney %4 = select i1 %3, bfloat %2, bfloat 0.0 3488f8016feSHugh Delaney ret bfloat %4 3498f8016feSHugh Delaney} 3508f8016feSHugh Delaney 3518f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 3528f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { 3538f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 3548f8016feSHugh Delaney; CHECK: { 355*5e5fd0e6Speterbell10; CHECK-NEXT: .reg .b16 %rs<11>; 3568f8016feSHugh Delaney; CHECK-EMPTY: 3578f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 3588f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 3598f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 3608f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, 
[fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 3618f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 3628f8016feSHugh Delaney; CHECK-NEXT: mov.b16 %rs5, 0x0000; 3638f8016feSHugh Delaney; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5; 364*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b16 %rs7, 0x3F80; 365*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b16 %rs8, 0x40E0; 366*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8; 367*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9; 368*5e5fd0e6Speterbell10; CHECK-NEXT: st.param.b16 [func_retval0], %rs10; 3698f8016feSHugh Delaney; CHECK-NEXT: ret; 3708f8016feSHugh Delaney; 3718f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 3728f8016feSHugh Delaney; CHECK-FTZ: { 373310e7987SAlex MacLean; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; 3748f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 3758f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .f32 %f<6>; 3768f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 3778f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 3788f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 3798f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 3808f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 3818f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 3828f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 3838f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5; 3848f8016feSHugh Delaney; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4; 3858f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; 3868f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; 3878f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; 388310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %f2; 3898f8016feSHugh Delaney; 
CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; 3908f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; 3918f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; 392310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs7; 3938f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; 3948f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f4, %r6; 3958f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f32 %f5, %f3, %f4; 396310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f5; 397310e7987SAlex MacLean; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; 3988f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 3998f8016feSHugh Delaney; 4008f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 4018f8016feSHugh Delaney; CHECK-SM70: { 4028f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 403310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<4>; 4048f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<29>; 4058f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<10>; 4068f8016feSHugh Delaney; CHECK-SM70-EMPTY: 4078f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 4088f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 4098f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 4108f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 4118f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 4128f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 4138f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 4148f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 4158f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 4168f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 4178f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 4188f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 4198f8016feSHugh 
Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 4208f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 4218f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 4228f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 4238f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 4248f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 4258f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 4268f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 4278f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 4288f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 429310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 4308f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f32 %f6, %f5, 0f40E00000; 4318f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 4328f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 4338f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 4348f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 4358f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f6, %f6; 4368f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 4378f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3; 438310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2; 4398f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 4408f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 4418f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536; 4428f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f8, %r22; 4438f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f32 %f9, %f7, %f8; 4448f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r23, %f9; 4458f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r24, %r23, 16, 1; 4468f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, %r23; 4478f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r26, %r25, 
32767; 4488f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f9, %f9; 4498f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304; 4508f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4; 451310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; } 452310e7987SAlex MacLean; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; 4538f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 4548f8016feSHugh Delaney %1 = fmul bfloat %a, %b 4558f8016feSHugh Delaney %2 = fadd bfloat %1, %c 4568f8016feSHugh Delaney %3 = fcmp ogt bfloat %2, 0.0 4578f8016feSHugh Delaney %4 = select i1 %3, bfloat %2, bfloat 0.0 4588f8016feSHugh Delaney %5 = fadd bfloat %2, 7.0 4598f8016feSHugh Delaney %6 = fadd bfloat %4, %5 4608f8016feSHugh Delaney ret bfloat %6 4618f8016feSHugh Delaney} 4628f8016feSHugh Delaney 4638f8016feSHugh Delaneydefine bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { 4648f8016feSHugh Delaney; CHECK-LABEL: fma_bf16_expanded_maxnum_no_nans( 4658f8016feSHugh Delaney; CHECK: { 4668f8016feSHugh Delaney; CHECK-NEXT: .reg .b16 %rs<5>; 4678f8016feSHugh Delaney; CHECK-EMPTY: 4688f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 4698f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0]; 4708f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1]; 4718f8016feSHugh Delaney; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2]; 4728f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 4738f8016feSHugh Delaney; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 4748f8016feSHugh Delaney; CHECK-NEXT: ret; 4758f8016feSHugh Delaney; 4768f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_no_nans( 4778f8016feSHugh Delaney; CHECK-FTZ: { 4788f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 4798f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 4808f8016feSHugh Delaney; CHECK-FTZ-NEXT: // 
%bb.0: 4818f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0]; 4828f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1]; 4838f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2]; 4848f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 4858f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 4868f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 4878f8016feSHugh Delaney; 4888f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_no_nans( 4898f8016feSHugh Delaney; CHECK-SM70: { 4908f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 491310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<2>; 4928f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<20>; 4938f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<7>; 4948f8016feSHugh Delaney; CHECK-SM70-EMPTY: 4958f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 4968f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; 4978f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 4988f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 4998f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; 5008f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 5018f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 5028f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; 5038f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 5048f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 5058f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 5068f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 5078f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 5088f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 5098f8016feSHugh 
Delaney; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 5108f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 5118f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 5128f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 5138f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 5148f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 5158f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000; 5168f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 5178f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 5188f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 5198f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 5208f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 5218f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 5228f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2; 5238f8016feSHugh Delaney; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; } 5248f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1; 5258f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 5268f8016feSHugh Delaney %1 = fmul bfloat %a, %b 5278f8016feSHugh Delaney %2 = fadd bfloat %1, %c 5288f8016feSHugh Delaney %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0) 5298f8016feSHugh Delaney ret bfloat %3 5308f8016feSHugh Delaney} 5318f8016feSHugh Delaney 5328f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { 5338f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_no_nans( 5348f8016feSHugh Delaney; CHECK: { 5358f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 5368f8016feSHugh Delaney; CHECK-EMPTY: 5378f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 5388f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 5398f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 
5408f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 5418f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1; 5428f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 5438f8016feSHugh Delaney; CHECK-NEXT: ret; 5448f8016feSHugh Delaney; 5458f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans( 5468f8016feSHugh Delaney; CHECK-FTZ: { 5478f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 5488f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 5498f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 5508f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 5518f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 5528f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 5538f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1; 5548f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 5558f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 5568f8016feSHugh Delaney; 5578f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans( 5588f8016feSHugh Delaney; CHECK-SM70: { 5598f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 5608f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 5618f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<7>; 5628f8016feSHugh Delaney; CHECK-SM70-EMPTY: 5638f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 5648f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 5658f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 5668f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 5678f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 5688f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 0; 5698f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, 
%r5; 5708f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 5718f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 5728f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 5738f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 5748f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r6; 5758f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 5768f8016feSHugh Delaney %1 = fmul <2 x half> %a, %b 5778f8016feSHugh Delaney %2 = fadd <2 x half> %1, %c 5788f8016feSHugh Delaney %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0> 5798f8016feSHugh Delaney %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 5808f8016feSHugh Delaney ret <2 x half> %4 5818f8016feSHugh Delaney} 5828f8016feSHugh Delaney 5838f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 5848f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { 5858f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 5868f8016feSHugh Delaney; CHECK: { 5878f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<10>; 5888f8016feSHugh Delaney; CHECK-EMPTY: 5898f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 5908f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 5918f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 5928f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 5938f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 5948f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 5958f8016feSHugh Delaney; CHECK-NEXT: max.f16x2 %r6, %r4, %r5; 5968f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r7, 1191200512; 5978f8016feSHugh Delaney; CHECK-NEXT: add.f16x2 %r8, %r4, %r7; 5988f8016feSHugh Delaney; CHECK-NEXT: 
add.f16x2 %r9, %r6, %r8; 5998f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r9; 6008f8016feSHugh Delaney; CHECK-NEXT: ret; 6018f8016feSHugh Delaney; 6028f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 6038f8016feSHugh Delaney; CHECK-FTZ: { 6048f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<10>; 6058f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 6068f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 6078f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 6088f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 6098f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 6108f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1; 6118f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 6128f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16x2 %r6, %r4, %r5; 6138f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r7, 1191200512; 6148f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f16x2 %r8, %r4, %r7; 6158f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f16x2 %r9, %r6, %r8; 6168f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r9; 6178f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 6188f8016feSHugh Delaney; 6198f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 6208f8016feSHugh Delaney; CHECK-SM70: { 6218f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 6228f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 6238f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<10>; 6248f8016feSHugh Delaney; CHECK-SM70-EMPTY: 6258f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 6268f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 6278f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, 
[fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 6288f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 6298f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 6308f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 0; 6318f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 6328f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 6338f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 6348f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 6358f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 6368f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r7, 1191200512; 6378f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f16x2 %r8, %r4, %r7; 6388f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f16x2 %r9, %r6, %r8; 6398f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r9; 6408f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 6418f8016feSHugh Delaney %1 = fmul <2 x half> %a, %b 6428f8016feSHugh Delaney %2 = fadd <2 x half> %1, %c 6438f8016feSHugh Delaney %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0> 6448f8016feSHugh Delaney %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 6458f8016feSHugh Delaney %5 = fadd <2 x half> %2, <half 7.0, half 7.0> 6468f8016feSHugh Delaney %6 = fadd <2 x half> %4, %5 6478f8016feSHugh Delaney ret <2 x half> %6 6488f8016feSHugh Delaney} 6498f8016feSHugh Delaney 6508f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 { 6518f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_unsafe_with_nans( 6528f8016feSHugh Delaney; CHECK: { 6538f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<7>; 6548f8016feSHugh Delaney; CHECK-EMPTY: 6558f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 6568f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2]; 
6578f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1]; 6588f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0]; 6598f8016feSHugh Delaney; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 6608f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 6618f8016feSHugh Delaney; CHECK-NEXT: max.f16x2 %r6, %r4, %r5; 6628f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r6; 6638f8016feSHugh Delaney; CHECK-NEXT: ret; 6648f8016feSHugh Delaney; 6658f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_unsafe_with_nans( 6668f8016feSHugh Delaney; CHECK-FTZ: { 6678f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 6688f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 6698f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 6708f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2]; 6718f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1]; 6728f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0]; 6738f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1; 6748f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 6758f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.ftz.f16x2 %r6, %r4, %r5; 6768f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r6; 6778f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 6788f8016feSHugh Delaney; 6798f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_unsafe_with_nans( 6808f8016feSHugh Delaney; CHECK-SM70: { 6818f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<3>; 6828f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 6838f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<7>; 6848f8016feSHugh Delaney; CHECK-SM70-EMPTY: 6858f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 6868f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2]; 
6878f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1]; 6888f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0]; 6898f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 6908f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, 0; 6918f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 6928f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 6938f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 6948f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 6958f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 6968f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r6; 6978f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 6988f8016feSHugh Delaney %1 = fmul <2 x half> %a, %b 6998f8016feSHugh Delaney %2 = fadd <2 x half> %1, %c 7008f8016feSHugh Delaney %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0> 7018f8016feSHugh Delaney %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 7028f8016feSHugh Delaney ret <2 x half> %4 7038f8016feSHugh Delaney} 7048f8016feSHugh Delaney 7058f8016feSHugh Delaneydefine <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { 7068f8016feSHugh Delaney; CHECK-LABEL: fma_f16x2_expanded_maxnum_no_nans( 7078f8016feSHugh Delaney; CHECK: { 7088f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 7098f8016feSHugh Delaney; CHECK-EMPTY: 7108f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 7118f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2]; 7128f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1]; 7138f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0]; 7148f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1; 7158f8016feSHugh 
Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 7168f8016feSHugh Delaney; CHECK-NEXT: ret; 7178f8016feSHugh Delaney; 7188f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_f16x2_expanded_maxnum_no_nans( 7198f8016feSHugh Delaney; CHECK-FTZ: { 7208f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 7218f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 7228f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 7238f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2]; 7248f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1]; 7258f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0]; 7268f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1; 7278f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 7288f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 7298f8016feSHugh Delaney; 7308f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_f16x2_expanded_maxnum_no_nans( 7318f8016feSHugh Delaney; CHECK-SM70: { 7328f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 7338f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<6>; 7348f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<5>; 7358f8016feSHugh Delaney; CHECK-SM70-EMPTY: 7368f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 7378f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2]; 7388f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1]; 7398f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0]; 7408f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 7418f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 7428f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs2; 7438f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 7448f8016feSHugh Delaney; CHECK-SM70-NEXT: 
cvt.rn.f16.f32 %rs3, %f2; 7458f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.f32.f16 %f3, %rs1; 7468f8016feSHugh Delaney; CHECK-SM70-NEXT: max.f32 %f4, %f3, 0f00000000; 7478f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs4, %f4; 7488f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r5, {%rs4, %rs3}; 7498f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r5; 7508f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 7518f8016feSHugh Delaney %1 = fmul <2 x half> %a, %b 7528f8016feSHugh Delaney %2 = fadd <2 x half> %1, %c 7538f8016feSHugh Delaney %3 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %2, <2 x half> <half 0.0, half 0.0>) 7548f8016feSHugh Delaney ret <2 x half> %3 7558f8016feSHugh Delaney} 7568f8016feSHugh Delaney 7578f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #1 { 7588f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_expanded_unsafe_with_nans( 7598f8016feSHugh Delaney; CHECK: { 7608f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<7>; 7618f8016feSHugh Delaney; CHECK-EMPTY: 7628f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 7638f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_2]; 7648f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1]; 7658f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_0]; 7668f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 7678f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 7688f8016feSHugh Delaney; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5; 7698f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r6; 7708f8016feSHugh Delaney; CHECK-NEXT: ret; 7718f8016feSHugh Delaney; 7728f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_expanded_unsafe_with_nans( 7738f8016feSHugh Delaney; CHECK-FTZ: { 7748f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 7758f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 
7768f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 7778f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_2]; 7788f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1]; 7798f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_0]; 7808f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 7818f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 7828f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5; 7838f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r6; 7848f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 7858f8016feSHugh Delaney; 7868f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_expanded_unsafe_with_nans( 7878f8016feSHugh Delaney; CHECK-SM70: { 7888f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 789310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<11>; 7908f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<31>; 7918f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<11>; 7928f8016feSHugh Delaney; CHECK-SM70-EMPTY: 7938f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 7948f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_0]; 7958f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1]; 7968f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_2]; 7978f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 7988f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; 7998f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 8008f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 801310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 802310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; 8038f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 8048f8016feSHugh 
Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 805310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 806310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; 8078f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 8088f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 8098f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 8108f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 8118f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 8128f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 8138f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 8148f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 8158f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 8168f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 817310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } 8188f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; 8198f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 8208f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 821310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; 8228f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 8238f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 824310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; 8258f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 8268f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 8278f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 8288f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 8298f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 8308f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 8318f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 8328f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 8338f8016feSHugh Delaney; 
CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 8348f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 835310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } 8368f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; 8378f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 8388f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; 8398f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; 8408f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f10, %r29; 8418f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; 842310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; 843310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; 844310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; 8458f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; 8468f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 8478f8016feSHugh Delaney %1 = fmul <2 x bfloat> %a, %b 8488f8016feSHugh Delaney %2 = fadd <2 x bfloat> %1, %c 8498f8016feSHugh Delaney %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0> 8508f8016feSHugh Delaney %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0> 8518f8016feSHugh Delaney ret <2 x bfloat> %4 8528f8016feSHugh Delaney} 8538f8016feSHugh Delaney 8548f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { 8558f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_expanded_no_nans( 8568f8016feSHugh Delaney; CHECK: { 8578f8016feSHugh Delaney; CHECK-NEXT: .reg .b32 %r<5>; 8588f8016feSHugh Delaney; CHECK-EMPTY: 8598f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 8608f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2]; 8618f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1]; 8628f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, 
[fma_bf16x2_expanded_no_nans_param_0]; 8638f8016feSHugh Delaney; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 8648f8016feSHugh Delaney; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 8658f8016feSHugh Delaney; CHECK-NEXT: ret; 8668f8016feSHugh Delaney; 8678f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans( 8688f8016feSHugh Delaney; CHECK-FTZ: { 8698f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 8708f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 8718f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 8728f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2]; 8738f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1]; 8748f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0]; 8758f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1; 8768f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 8778f8016feSHugh Delaney; CHECK-FTZ-NEXT: ret; 8788f8016feSHugh Delaney; 8798f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans( 8808f8016feSHugh Delaney; CHECK-SM70: { 8818f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<5>; 882310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<11>; 8838f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .b32 %r<31>; 8848f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<11>; 8858f8016feSHugh Delaney; CHECK-SM70-EMPTY: 8868f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 8878f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0]; 8888f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1]; 8898f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2]; 8908f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 8918f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1; 8928f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, 
%r4, 16; 8938f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 894310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 895310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3; 8968f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 8978f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 898310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 899310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5; 9008f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 9018f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 9028f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 9038f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 9048f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 9058f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 9068f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 9078f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 9088f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 9098f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 910310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } 9118f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2; 9128f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 9138f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 914310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4; 9158f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 9168f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 917310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6; 9188f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 9198f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 9208f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 9218f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 9228f8016feSHugh Delaney; 
CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 9238f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 9248f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 9258f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 9268f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 9278f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 928310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } 9298f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; 9308f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 9318f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; 9328f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; 9338f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f10, %r29; 9348f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; 935310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; 936310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; 937310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9}; 9388f8016feSHugh Delaney; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30; 9398f8016feSHugh Delaney; CHECK-SM70-NEXT: ret; 9408f8016feSHugh Delaney %1 = fmul <2 x bfloat> %a, %b 9418f8016feSHugh Delaney %2 = fadd <2 x bfloat> %1, %c 9428f8016feSHugh Delaney %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0> 9438f8016feSHugh Delaney %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0> 9448f8016feSHugh Delaney ret <2 x bfloat> %4 9458f8016feSHugh Delaney} 9468f8016feSHugh Delaney 9478f8016feSHugh Delaney; FMA relu shouldn't be selected if the FMA operation has multiple uses 9488f8016feSHugh Delaneydefine <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { 9498f8016feSHugh Delaney; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( 
9508f8016feSHugh Delaney; CHECK: { 951*5e5fd0e6Speterbell10; CHECK-NEXT: .reg .b32 %r<11>; 9528f8016feSHugh Delaney; CHECK-EMPTY: 9538f8016feSHugh Delaney; CHECK-NEXT: // %bb.0: 9548f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 9558f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 9568f8016feSHugh Delaney; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 9578f8016feSHugh Delaney; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 9588f8016feSHugh Delaney; CHECK-NEXT: mov.b32 %r5, 0; 9598f8016feSHugh Delaney; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5; 960*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b32 %r7, 1065369472; 961*5e5fd0e6Speterbell10; CHECK-NEXT: mov.b32 %r8, 1088438496; 962*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8; 963*5e5fd0e6Speterbell10; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9; 964*5e5fd0e6Speterbell10; CHECK-NEXT: st.param.b32 [func_retval0], %r10; 9658f8016feSHugh Delaney; CHECK-NEXT: ret; 9668f8016feSHugh Delaney; 9678f8016feSHugh Delaney; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( 9688f8016feSHugh Delaney; CHECK-FTZ: { 969310e7987SAlex MacLean; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 9708f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .b32 %r<20>; 9718f8016feSHugh Delaney; CHECK-FTZ-NEXT: .reg .f32 %f<11>; 9728f8016feSHugh Delaney; CHECK-FTZ-EMPTY: 9738f8016feSHugh Delaney; CHECK-FTZ-NEXT: // %bb.0: 9748f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 9758f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 9768f8016feSHugh Delaney; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 9778f8016feSHugh Delaney; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; 9788f8016feSHugh Delaney; 
CHECK-FTZ-NEXT: mov.b32 %r5, 0; 9798f8016feSHugh Delaney; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5; 9808f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4; 9814b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs2; 9828f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16; 9838f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f1, %r8; 9848f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; 985310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2; 9864b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1; 9878f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16; 9888f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f3, %r10; 9898f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f32 %f4, %f3, 0f40E00000; 990310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4; 991310e7987SAlex MacLean; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6; 992310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs5; 9938f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16; 9948f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f5, %r12; 995310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4; 9968f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16; 9978f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f6, %r14; 9988f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f32 %f7, %f5, %f6; 999310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs6; 10008f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16; 10018f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f8, %r16; 1002310e7987SAlex MacLean; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3; 10038f8016feSHugh Delaney; CHECK-FTZ-NEXT: shl.b32 %r18, %r17, 16; 10048f8016feSHugh Delaney; CHECK-FTZ-NEXT: mov.b32 %f9, %r18; 10058f8016feSHugh Delaney; CHECK-FTZ-NEXT: add.ftz.f32 %f10, %f8, %f9; 10064b24ab4bSAlex MacLean; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7; 10078f8016feSHugh Delaney; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r19; 10088f8016feSHugh 
Delaney; CHECK-FTZ-NEXT: ret; 10098f8016feSHugh Delaney; 10108f8016feSHugh Delaney; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( 10118f8016feSHugh Delaney; CHECK-SM70: { 10128f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .pred %p<9>; 1013310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b16 %rs<11>; 1014310e7987SAlex MacLean; CHECK-SM70-NEXT: .reg .b32 %r<61>; 10158f8016feSHugh Delaney; CHECK-SM70-NEXT: .reg .f32 %f<19>; 10168f8016feSHugh Delaney; CHECK-SM70-EMPTY: 10178f8016feSHugh Delaney; CHECK-SM70-NEXT: // %bb.0: 10188f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 10198f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 10208f8016feSHugh Delaney; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 10218f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; 10228f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2; 10238f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16; 10248f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f1, %r5; 1025310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 1026310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4; 10278f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16; 10288f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f2, %r7; 1029310e7987SAlex MacLean; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; 1030310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6; 10318f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16; 10328f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f3, %r9; 10338f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 10348f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r10, %f4; 10358f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1; 10368f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10; 
10378f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767; 10388f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 10398f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304; 10408f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1; 1041310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; } 10428f8016feSHugh Delaney; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1; 10438f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16; 10448f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f5, %r17; 1045310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3; 10468f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16; 10478f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f6, %r19; 1048310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5; 10498f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 10508f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 10518f8016feSHugh Delaney; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; 10528f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r22, %f8; 10538f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1; 10548f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22; 10558f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767; 10568f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8; 10578f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304; 10588f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2; 1059310e7987SAlex MacLean; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; } 10608f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536; 10618f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f9, %r28; 10628f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000; 10638f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536; 10648f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f10, %r29; 
10658f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000; 1066310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4; 1067310e7987SAlex MacLean; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; 10688f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f32 %f11, %f10, 0f40E00000; 10698f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r30, %f11; 10708f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1; 10718f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r30; 10728f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767; 10738f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f11, %f11; 10748f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r34, %r30, 4194304; 10758f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p5; 10768f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f32 %f12, %f9, 0f40E00000; 10778f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r36, %f12; 10788f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1; 10798f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36; 10808f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767; 10818f8016feSHugh Delaney; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f12, %f12; 10828f8016feSHugh Delaney; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304; 10838f8016feSHugh Delaney; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6; 1084310e7987SAlex MacLean; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10; 10858f8016feSHugh Delaney; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16; 10868f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f13, %r43; 10878f8016feSHugh Delaney; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536; 10888f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %f14, %r44; 10898f8016feSHugh Delaney; CHECK-SM70-NEXT: add.f32 %f15, %f13, %f14; 10908f8016feSHugh Delaney; CHECK-SM70-NEXT: mov.b32 %r45, %f15; 10918f8016feSHugh Delaney; CHECK-SM70-NEXT: bfe.u32 %r46, %r45, 16, 1; 10928f8016feSHugh Delaney; CHECK-SM70-NEXT: add.s32 %r47, %r46, %r45; 
; CHECK-SM70-NEXT:    add.s32 %r48, %r47, 32767;
; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %f15, %f15;
; CHECK-SM70-NEXT:    or.b32 %r49, %r45, 4194304;
; CHECK-SM70-NEXT:    selp.b32 %r50, %r49, %r48, %p7;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r51, %rs9;
; CHECK-SM70-NEXT:    shl.b32 %r52, %r51, 16;
; CHECK-SM70-NEXT:    mov.b32 %f16, %r52;
; CHECK-SM70-NEXT:    and.b32 %r53, %r35, -65536;
; CHECK-SM70-NEXT:    mov.b32 %f17, %r53;
; CHECK-SM70-NEXT:    add.f32 %f18, %f16, %f17;
; CHECK-SM70-NEXT:    mov.b32 %r54, %f18;
; CHECK-SM70-NEXT:    bfe.u32 %r55, %r54, 16, 1;
; CHECK-SM70-NEXT:    add.s32 %r56, %r55, %r54;
; CHECK-SM70-NEXT:    add.s32 %r57, %r56, 32767;
; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %f18, %f18;
; CHECK-SM70-NEXT:    or.b32 %r58, %r54, 4194304;
; CHECK-SM70-NEXT:    selp.b32 %r59, %r58, %r57, %p8;
; CHECK-SM70-NEXT:    prmt.b32 %r60, %r59, %r50, 0x7632U;
; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r60;
; CHECK-SM70-NEXT:    ret;
  %1 = fmul <2 x bfloat> %a, %b
  %2 = fadd <2 x bfloat> %1, %c
  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
  %5 = fadd <2 x bfloat> %2, <bfloat 7.0, bfloat 7.0>
  %6 = fadd <2 x bfloat> %4, %5
  ret <2 x bfloat> %6
}

; An fmul followed by an fadd on <2 x bfloat>, with the sum passed to
; maxnum(x, 0.0). The two sm_80 runs (default and
; -denormal-fp-math-f32=preserve-sign) both expect a single
; fma.rn.relu.bf16x2 with no .ftz modifier. The sm_70 run expects no relu
; form: the operation is expanded into f32 arithmetic (fma.rn.f32, max.f32)
; plus integer bit operations instead.
; NOTE(review): the intrinsic below is spelled @llvm.maxnum.bf16x2 rather than
; with the usual type-mangled suffix for <2 x bfloat>; presumably the IR reader
; remangles the name -- confirm before changing it.
define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
; CHECK-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
; CHECK-NEXT:    fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK-FTZ:       {
; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT:  // %bb.0:
; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
; CHECK-FTZ-NEXT:    fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-FTZ-NEXT:    ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK-SM70:       {
; CHECK-SM70-NEXT:    .reg .pred %p<5>;
; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
; CHECK-SM70-NEXT:    .reg .f32 %f<13>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT:  // %bb.0:
; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
; CHECK-SM70-NEXT:    max.f32 %f10, %f9, 0f00000000;
; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT:    ret;
  %1 = fmul <2 x bfloat> %a, %b
  %2 = fadd <2 x bfloat> %1, %c
  %3 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
  ret <2 x bfloat> %3
}

attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
attributes #1 = { "unsafe-fp-math"="true" }