1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s 3; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} 4 5; Using FTZ should emit fma.ftz.relu for f16, not for bf16 6; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ 7; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} 8 9; SM < 80 should not emit fma{.ftz}.relu (the relu form needs SM >= 80, which in turn needs PTX version >= 70) 10; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70 11 12define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 { 13; CHECK-LABEL: fma_f16_expanded_no_nans( 14; CHECK: { 15; CHECK-NEXT: .reg .b16 %rs<5>; 16; CHECK-EMPTY: 17; CHECK-NEXT: // %bb.0: 18; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 19; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 20; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 21; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 22; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 23; CHECK-NEXT: ret; 24; 25; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans( 26; CHECK-FTZ: { 27; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 28; CHECK-FTZ-EMPTY: 29; CHECK-FTZ-NEXT: // %bb.0: 30; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 31; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 32; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 33; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 34; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 35; CHECK-FTZ-NEXT: ret; 36; 37; CHECK-SM70-LABEL: fma_f16_expanded_no_nans( 38; CHECK-SM70: { 39; CHECK-SM70-NEXT: .reg .pred %p<2>; 40; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 41; CHECK-SM70-EMPTY: 42; CHECK-SM70-NEXT: 
// %bb.0: 43; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0]; 44; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1]; 45; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2]; 46; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 47; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 48; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 49; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 50; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; 51; CHECK-SM70-NEXT: ret; 52 %1 = fmul half %a, %b 53 %2 = fadd half %1, %c 54 %3 = fcmp ogt half %2, 0.0 55 %4 = select i1 %3, half %2, half 0.0 56 ret half %4 57} 58 59; FMA relu shouldn't be selected if the FMA operation has multiple uses 60define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0 { 61; CHECK-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 62; CHECK: { 63; CHECK-NEXT: .reg .b16 %rs<10>; 64; CHECK-EMPTY: 65; CHECK-NEXT: // %bb.0: 66; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 67; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 68; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 69; CHECK-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 70; CHECK-NEXT: mov.b16 %rs5, 0x0000; 71; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5; 72; CHECK-NEXT: mov.b16 %rs7, 0x4700; 73; CHECK-NEXT: add.f16 %rs8, %rs4, %rs7; 74; CHECK-NEXT: add.f16 %rs9, %rs6, %rs8; 75; CHECK-NEXT: st.param.b16 [func_retval0], %rs9; 76; CHECK-NEXT: ret; 77; 78; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 79; CHECK-FTZ: { 80; CHECK-FTZ-NEXT: .reg .b16 %rs<10>; 81; CHECK-FTZ-EMPTY: 82; CHECK-FTZ-NEXT: // %bb.0: 83; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 84; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 85; CHECK-FTZ-NEXT: ld.param.b16 %rs3, 
[fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 86; CHECK-FTZ-NEXT: fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3; 87; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 88; CHECK-FTZ-NEXT: max.ftz.f16 %rs6, %rs4, %rs5; 89; CHECK-FTZ-NEXT: mov.b16 %rs7, 0x4700; 90; CHECK-FTZ-NEXT: add.ftz.f16 %rs8, %rs4, %rs7; 91; CHECK-FTZ-NEXT: add.ftz.f16 %rs9, %rs6, %rs8; 92; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs9; 93; CHECK-FTZ-NEXT: ret; 94; 95; CHECK-SM70-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma( 96; CHECK-SM70: { 97; CHECK-SM70-NEXT: .reg .pred %p<2>; 98; CHECK-SM70-NEXT: .reg .b16 %rs<10>; 99; CHECK-SM70-EMPTY: 100; CHECK-SM70-NEXT: // %bb.0: 101; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0]; 102; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1]; 103; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2]; 104; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 105; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 106; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 107; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 108; CHECK-SM70-NEXT: mov.b16 %rs7, 0x4700; 109; CHECK-SM70-NEXT: add.f16 %rs8, %rs4, %rs7; 110; CHECK-SM70-NEXT: add.f16 %rs9, %rs6, %rs8; 111; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs9; 112; CHECK-SM70-NEXT: ret; 113 %1 = fmul half %a, %b 114 %2 = fadd half %1, %c 115 %3 = fcmp ogt half %2, 0.0 116 %4 = select i1 %3, half %2, half 0.0 117 %5 = fadd half %2, 7.0 118 %6 = fadd half %4, %5 119 ret half %6 120} 121 122define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 { 123; CHECK-LABEL: fma_f16_expanded_unsafe_with_nans( 124; CHECK: { 125; CHECK-NEXT: .reg .b16 %rs<7>; 126; CHECK-EMPTY: 127; CHECK-NEXT: // %bb.0: 128; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0]; 129; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1]; 130; CHECK-NEXT: 
ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2]; 131; CHECK-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 132; CHECK-NEXT: mov.b16 %rs5, 0x0000; 133; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5; 134; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; 135; CHECK-NEXT: ret; 136; 137; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe_with_nans( 138; CHECK-FTZ: { 139; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 140; CHECK-FTZ-EMPTY: 141; CHECK-FTZ-NEXT: // %bb.0: 142; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0]; 143; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1]; 144; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2]; 145; CHECK-FTZ-NEXT: fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3; 146; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 147; CHECK-FTZ-NEXT: max.ftz.f16 %rs6, %rs4, %rs5; 148; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6; 149; CHECK-FTZ-NEXT: ret; 150; 151; CHECK-SM70-LABEL: fma_f16_expanded_unsafe_with_nans( 152; CHECK-SM70: { 153; CHECK-SM70-NEXT: .reg .pred %p<2>; 154; CHECK-SM70-NEXT: .reg .b16 %rs<7>; 155; CHECK-SM70-EMPTY: 156; CHECK-SM70-NEXT: // %bb.0: 157; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0]; 158; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1]; 159; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2]; 160; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 161; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000; 162; CHECK-SM70-NEXT: setp.gt.f16 %p1, %rs4, %rs5; 163; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; 164; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; 165; CHECK-SM70-NEXT: ret; 166 %1 = fmul half %a, %b 167 %2 = fadd half %1, %c 168 %3 = fcmp ogt half %2, 0.0 169 %4 = select i1 %3, half %2, half 0.0 170 ret half %4 171} 172 173define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 { 174; CHECK-LABEL: fma_f16_expanded_maxnum_no_nans( 175; CHECK: { 
176; CHECK-NEXT: .reg .b16 %rs<5>; 177; CHECK-EMPTY: 178; CHECK-NEXT: // %bb.0: 179; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 180; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 181; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 182; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3; 183; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 184; CHECK-NEXT: ret; 185; 186; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_no_nans( 187; CHECK-FTZ: { 188; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 189; CHECK-FTZ-EMPTY: 190; CHECK-FTZ-NEXT: // %bb.0: 191; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 192; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 193; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 194; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3; 195; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 196; CHECK-FTZ-NEXT: ret; 197; 198; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_no_nans( 199; CHECK-SM70: { 200; CHECK-SM70-NEXT: .reg .b16 %rs<6>; 201; CHECK-SM70-NEXT: .reg .f32 %f<3>; 202; CHECK-SM70-EMPTY: 203; CHECK-SM70-NEXT: // %bb.0: 204; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0]; 205; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1]; 206; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2]; 207; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3; 208; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4; 209; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000; 210; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2; 211; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; 212; CHECK-SM70-NEXT: ret; 213 %1 = fmul half %a, %b 214 %2 = fadd half %1, %c 215 %3 = call half @llvm.maxnum.f16(half %2, half 0.0) 216 ret half %3 217} 218 219define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) #1 { 220; 
CHECK-LABEL: fma_bf16_expanded_unsafe_with_nans( 221; CHECK: { 222; CHECK-NEXT: .reg .b16 %rs<7>; 223; CHECK-EMPTY: 224; CHECK-NEXT: // %bb.0: 225; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_with_nans_param_0]; 226; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_with_nans_param_1]; 227; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_with_nans_param_2]; 228; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 229; CHECK-NEXT: mov.b16 %rs5, 0x0000; 230; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5; 231; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; 232; CHECK-NEXT: ret; 233; 234; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe_with_nans( 235; CHECK-FTZ: { 236; CHECK-FTZ-NEXT: .reg .b16 %rs<7>; 237; CHECK-FTZ-EMPTY: 238; CHECK-FTZ-NEXT: // %bb.0: 239; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_with_nans_param_0]; 240; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_with_nans_param_1]; 241; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_with_nans_param_2]; 242; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 243; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 244; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5; 245; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6; 246; CHECK-FTZ-NEXT: ret; 247; 248; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe_with_nans( 249; CHECK-SM70: { 250; CHECK-SM70-NEXT: .reg .pred %p<3>; 251; CHECK-SM70-NEXT: .reg .b16 %rs<3>; 252; CHECK-SM70-NEXT: .reg .b32 %r<14>; 253; CHECK-SM70-NEXT: .reg .f32 %f<6>; 254; CHECK-SM70-EMPTY: 255; CHECK-SM70-NEXT: // %bb.0: 256; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2]; 257; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 258; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 259; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1]; 260; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 261; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 262; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0]; 263; 
CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 264; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 265; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 266; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 267; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 268; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 269; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 270; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 271; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 272; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 273; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 274; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 275; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 276; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 277; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 278; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; 279; CHECK-SM70-NEXT: ret; 280 %1 = fmul bfloat %a, %b 281 %2 = fadd bfloat %1, %c 282 %3 = fcmp ogt bfloat %2, 0.0 283 %4 = select i1 %3, bfloat %2, bfloat 0.0 284 ret bfloat %4 285} 286 287define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { 288; CHECK-LABEL: fma_bf16_expanded_no_nans( 289; CHECK: { 290; CHECK-NEXT: .reg .b16 %rs<5>; 291; CHECK-EMPTY: 292; CHECK-NEXT: // %bb.0: 293; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0]; 294; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1]; 295; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2]; 296; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 297; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 298; CHECK-NEXT: ret; 299; 300; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans( 301; CHECK-FTZ: { 302; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 303; CHECK-FTZ-EMPTY: 304; CHECK-FTZ-NEXT: // %bb.0: 305; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0]; 306; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1]; 307; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2]; 308; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, 
%rs2, %rs3; 309; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 310; CHECK-FTZ-NEXT: ret; 311; 312; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans( 313; CHECK-SM70: { 314; CHECK-SM70-NEXT: .reg .pred %p<3>; 315; CHECK-SM70-NEXT: .reg .b16 %rs<3>; 316; CHECK-SM70-NEXT: .reg .b32 %r<14>; 317; CHECK-SM70-NEXT: .reg .f32 %f<6>; 318; CHECK-SM70-EMPTY: 319; CHECK-SM70-NEXT: // %bb.0: 320; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2]; 321; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 322; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 323; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1]; 324; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 325; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 326; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0]; 327; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 328; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 329; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 330; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 331; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 332; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 333; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 334; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 335; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 336; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 337; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 338; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 339; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 340; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 341; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 342; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; 343; CHECK-SM70-NEXT: ret; 344 %1 = fmul bfloat %a, %b 345 %2 = fadd bfloat %1, %c 346 %3 = fcmp ogt bfloat %2, 0.0 347 %4 = select i1 %3, bfloat %2, bfloat 0.0 348 ret bfloat %4 349} 350 351; FMA relu shouldn't be selected if the FMA operation has multiple uses 352define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { 353; CHECK-LABEL: 
fma_bf16_expanded_no_nans_multiple_uses_of_fma( 354; CHECK: { 355; CHECK-NEXT: .reg .b16 %rs<11>; 356; CHECK-EMPTY: 357; CHECK-NEXT: // %bb.0: 358; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 359; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 360; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 361; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 362; CHECK-NEXT: mov.b16 %rs5, 0x0000; 363; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5; 364; CHECK-NEXT: mov.b16 %rs7, 0x3F80; 365; CHECK-NEXT: mov.b16 %rs8, 0x40E0; 366; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8; 367; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9; 368; CHECK-NEXT: st.param.b16 [func_retval0], %rs10; 369; CHECK-NEXT: ret; 370; 371; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 372; CHECK-FTZ: { 373; CHECK-FTZ-NEXT: .reg .b16 %rs<9>; 374; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 375; CHECK-FTZ-NEXT: .reg .f32 %f<6>; 376; CHECK-FTZ-EMPTY: 377; CHECK-FTZ-NEXT: // %bb.0: 378; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 379; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 380; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 381; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; 382; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000; 383; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5; 384; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4; 385; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16; 386; CHECK-FTZ-NEXT: mov.b32 %f1, %r2; 387; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000; 388; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %f2; 389; CHECK-FTZ-NEXT: cvt.u32.u16 %r3, %rs6; 390; CHECK-FTZ-NEXT: shl.b32 %r4, %r3, 16; 391; CHECK-FTZ-NEXT: mov.b32 %f3, %r4; 392; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs7; 393; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16; 394; CHECK-FTZ-NEXT: mov.b32 %f4, 
%r6; 395; CHECK-FTZ-NEXT: add.ftz.f32 %f5, %f3, %f4; 396; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %f5; 397; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8; 398; CHECK-FTZ-NEXT: ret; 399; 400; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( 401; CHECK-SM70: { 402; CHECK-SM70-NEXT: .reg .pred %p<5>; 403; CHECK-SM70-NEXT: .reg .b16 %rs<4>; 404; CHECK-SM70-NEXT: .reg .b32 %r<29>; 405; CHECK-SM70-NEXT: .reg .f32 %f<10>; 406; CHECK-SM70-EMPTY: 407; CHECK-SM70-NEXT: // %bb.0: 408; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; 409; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 410; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 411; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; 412; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 413; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 414; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; 415; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 416; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 417; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 418; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 419; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 420; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 421; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 422; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 423; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 424; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 425; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; } 426; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 427; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 428; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000; 429; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; 430; CHECK-SM70-NEXT: add.f32 %f6, %f5, 0f40E00000; 431; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 432; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 433; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 434; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 435; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f6, %f6; 436; 
CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 437; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3; 438; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2; 439; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16; 440; CHECK-SM70-NEXT: mov.b32 %f7, %r21; 441; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536; 442; CHECK-SM70-NEXT: mov.b32 %f8, %r22; 443; CHECK-SM70-NEXT: add.f32 %f9, %f7, %f8; 444; CHECK-SM70-NEXT: mov.b32 %r23, %f9; 445; CHECK-SM70-NEXT: bfe.u32 %r24, %r23, 16, 1; 446; CHECK-SM70-NEXT: add.s32 %r25, %r24, %r23; 447; CHECK-SM70-NEXT: add.s32 %r26, %r25, 32767; 448; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f9, %f9; 449; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304; 450; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4; 451; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; } 452; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3; 453; CHECK-SM70-NEXT: ret; 454 %1 = fmul bfloat %a, %b 455 %2 = fadd bfloat %1, %c 456 %3 = fcmp ogt bfloat %2, 0.0 457 %4 = select i1 %3, bfloat %2, bfloat 0.0 458 %5 = fadd bfloat %2, 7.0 459 %6 = fadd bfloat %4, %5 460 ret bfloat %6 461} 462 463define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { 464; CHECK-LABEL: fma_bf16_expanded_maxnum_no_nans( 465; CHECK: { 466; CHECK-NEXT: .reg .b16 %rs<5>; 467; CHECK-EMPTY: 468; CHECK-NEXT: // %bb.0: 469; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0]; 470; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1]; 471; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2]; 472; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 473; CHECK-NEXT: st.param.b16 [func_retval0], %rs4; 474; CHECK-NEXT: ret; 475; 476; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_no_nans( 477; CHECK-FTZ: { 478; CHECK-FTZ-NEXT: .reg .b16 %rs<5>; 479; CHECK-FTZ-EMPTY: 480; CHECK-FTZ-NEXT: // %bb.0: 481; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0]; 482; CHECK-FTZ-NEXT: ld.param.b16 %rs2, 
[fma_bf16_expanded_maxnum_no_nans_param_1]; 483; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2]; 484; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3; 485; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4; 486; CHECK-FTZ-NEXT: ret; 487; 488; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_no_nans( 489; CHECK-SM70: { 490; CHECK-SM70-NEXT: .reg .pred %p<3>; 491; CHECK-SM70-NEXT: .reg .b16 %rs<2>; 492; CHECK-SM70-NEXT: .reg .b32 %r<20>; 493; CHECK-SM70-NEXT: .reg .f32 %f<7>; 494; CHECK-SM70-EMPTY: 495; CHECK-SM70-NEXT: // %bb.0: 496; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; 497; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; 498; CHECK-SM70-NEXT: mov.b32 %f1, %r2; 499; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; 500; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; 501; CHECK-SM70-NEXT: mov.b32 %f2, %r4; 502; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; 503; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; 504; CHECK-SM70-NEXT: mov.b32 %f3, %r6; 505; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; 506; CHECK-SM70-NEXT: mov.b32 %r7, %f4; 507; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 508; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7; 509; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767; 510; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4; 511; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304; 512; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 513; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536; 514; CHECK-SM70-NEXT: mov.b32 %f5, %r13; 515; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000; 516; CHECK-SM70-NEXT: mov.b32 %r14, %f6; 517; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1; 518; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14; 519; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767; 520; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 521; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304; 522; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2; 523; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, 
%rs1}, %r19; } 524; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1; 525; CHECK-SM70-NEXT: ret; 526 %1 = fmul bfloat %a, %b 527 %2 = fadd bfloat %1, %c 528 %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0) 529 ret bfloat %3 530} 531 532define <2 x half> @fma_f16x2_expanded_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { 533; CHECK-LABEL: fma_f16x2_expanded_no_nans( 534; CHECK: { 535; CHECK-NEXT: .reg .b32 %r<5>; 536; CHECK-EMPTY: 537; CHECK-NEXT: // %bb.0: 538; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 539; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 540; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 541; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1; 542; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 543; CHECK-NEXT: ret; 544; 545; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans( 546; CHECK-FTZ: { 547; CHECK-FTZ-NEXT: .reg .b32 %r<5>; 548; CHECK-FTZ-EMPTY: 549; CHECK-FTZ-NEXT: // %bb.0: 550; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 551; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 552; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 553; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1; 554; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4; 555; CHECK-FTZ-NEXT: ret; 556; 557; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans( 558; CHECK-SM70: { 559; CHECK-SM70-NEXT: .reg .pred %p<3>; 560; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 561; CHECK-SM70-NEXT: .reg .b32 %r<7>; 562; CHECK-SM70-EMPTY: 563; CHECK-SM70-NEXT: // %bb.0: 564; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2]; 565; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1]; 566; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0]; 567; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 568; CHECK-SM70-NEXT: mov.b32 %r5, 0; 569; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 
570; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 571; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 572; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 573; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 574; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r6; 575; CHECK-SM70-NEXT: ret; 576 %1 = fmul <2 x half> %a, %b 577 %2 = fadd <2 x half> %1, %c 578 %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0> 579 %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 580 ret <2 x half> %4 581} 582 583; FMA relu shouldn't be selected if the FMA operation has multiple uses 584define <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { 585; CHECK-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 586; CHECK: { 587; CHECK-NEXT: .reg .b32 %r<10>; 588; CHECK-EMPTY: 589; CHECK-NEXT: // %bb.0: 590; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 591; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 592; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 593; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 594; CHECK-NEXT: mov.b32 %r5, 0; 595; CHECK-NEXT: max.f16x2 %r6, %r4, %r5; 596; CHECK-NEXT: mov.b32 %r7, 1191200512; 597; CHECK-NEXT: add.f16x2 %r8, %r4, %r7; 598; CHECK-NEXT: add.f16x2 %r9, %r6, %r8; 599; CHECK-NEXT: st.param.b32 [func_retval0], %r9; 600; CHECK-NEXT: ret; 601; 602; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 603; CHECK-FTZ: { 604; CHECK-FTZ-NEXT: .reg .b32 %r<10>; 605; CHECK-FTZ-EMPTY: 606; CHECK-FTZ-NEXT: // %bb.0: 607; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 608; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 609; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 610; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 
%r4, %r3, %r2, %r1; 611; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 612; CHECK-FTZ-NEXT: max.ftz.f16x2 %r6, %r4, %r5; 613; CHECK-FTZ-NEXT: mov.b32 %r7, 1191200512; 614; CHECK-FTZ-NEXT: add.ftz.f16x2 %r8, %r4, %r7; 615; CHECK-FTZ-NEXT: add.ftz.f16x2 %r9, %r6, %r8; 616; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r9; 617; CHECK-FTZ-NEXT: ret; 618; 619; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma( 620; CHECK-SM70: { 621; CHECK-SM70-NEXT: .reg .pred %p<3>; 622; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 623; CHECK-SM70-NEXT: .reg .b32 %r<10>; 624; CHECK-SM70-EMPTY: 625; CHECK-SM70-NEXT: // %bb.0: 626; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; 627; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1]; 628; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0]; 629; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 630; CHECK-SM70-NEXT: mov.b32 %r5, 0; 631; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 632; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 633; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 634; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 635; CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3}; 636; CHECK-SM70-NEXT: mov.b32 %r7, 1191200512; 637; CHECK-SM70-NEXT: add.f16x2 %r8, %r4, %r7; 638; CHECK-SM70-NEXT: add.f16x2 %r9, %r6, %r8; 639; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r9; 640; CHECK-SM70-NEXT: ret; 641 %1 = fmul <2 x half> %a, %b 642 %2 = fadd <2 x half> %1, %c 643 %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0> 644 %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0> 645 %5 = fadd <2 x half> %2, <half 7.0, half 7.0> 646 %6 = fadd <2 x half> %4, %5 647 ret <2 x half> %6 648} 649 650define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 { 651; CHECK-LABEL: fma_f16x2_expanded_unsafe_with_nans( 652; CHECK: { 653; CHECK-NEXT: .reg .b32 
%r<7>; 654; CHECK-EMPTY: 655; CHECK-NEXT: // %bb.0: 656; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2]; 657; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1]; 658; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0]; 659; CHECK-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 660; CHECK-NEXT: mov.b32 %r5, 0; 661; CHECK-NEXT: max.f16x2 %r6, %r4, %r5; 662; CHECK-NEXT: st.param.b32 [func_retval0], %r6; 663; CHECK-NEXT: ret; 664; 665; CHECK-FTZ-LABEL: fma_f16x2_expanded_unsafe_with_nans( 666; CHECK-FTZ: { 667; CHECK-FTZ-NEXT: .reg .b32 %r<7>; 668; CHECK-FTZ-EMPTY: 669; CHECK-FTZ-NEXT: // %bb.0: 670; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2]; 671; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1]; 672; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0]; 673; CHECK-FTZ-NEXT: fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1; 674; CHECK-FTZ-NEXT: mov.b32 %r5, 0; 675; CHECK-FTZ-NEXT: max.ftz.f16x2 %r6, %r4, %r5; 676; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r6; 677; CHECK-FTZ-NEXT: ret; 678; 679; CHECK-SM70-LABEL: fma_f16x2_expanded_unsafe_with_nans( 680; CHECK-SM70: { 681; CHECK-SM70-NEXT: .reg .pred %p<3>; 682; CHECK-SM70-NEXT: .reg .b16 %rs<5>; 683; CHECK-SM70-NEXT: .reg .b32 %r<7>; 684; CHECK-SM70-EMPTY: 685; CHECK-SM70-NEXT: // %bb.0: 686; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2]; 687; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1]; 688; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0]; 689; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1; 690; CHECK-SM70-NEXT: mov.b32 %r5, 0; 691; CHECK-SM70-NEXT: setp.gt.f16x2 %p1|%p2, %r4, %r5; 692; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; 693; CHECK-SM70-NEXT: selp.b16 %rs3, %rs2, 0x0000, %p2; 694; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; 695; 
CHECK-SM70-NEXT: mov.b32 %r6, {%rs4, %rs3};
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-SM70-NEXT: ret;
  %1 = fmul <2 x half> %a, %b
  %2 = fadd <2 x half> %1, %c
  %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
  %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
  ret <2 x half> %4
}

; <2 x half>, attribute group #0: fmul + fadd whose result is clamped with
; llvm.maxnum against a zero vector. The sm_80 expectations contain a single
; fma.rn.relu.f16x2 (fma.rn.ftz.relu.f16x2 in the preserve-sign configuration);
; the sm_70 expectations keep fma.rn.f16x2 and clamp each lane with max.f32.
; NOTE(review): the `.f16x2` intrinsic suffix is not the usual `.v2f16`
; overload spelling; presumably it is resolved from the call signature -
; confirm.
define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
; CHECK-LABEL: fma_f16x2_expanded_maxnum_no_nans(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
; CHECK-NEXT: fma.rn.relu.f16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_f16x2_expanded_maxnum_no_nans(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b32 %r<5>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1];
; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_f16x2_expanded_maxnum_no_nans(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .b16 %rs<5>;
; CHECK-SM70-NEXT: .reg .b32 %r<6>;
; CHECK-SM70-NEXT: .reg .f32 %f<5>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1];
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
; CHECK-SM70-NEXT: fma.rn.f16x2 %r4, %r3, %r2, %r1;
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs2;
; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000;
; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs3, %f2;
; CHECK-SM70-NEXT: cvt.f32.f16 %f3, %rs1;
; CHECK-SM70-NEXT: max.f32 %f4, %f3, 0f00000000;
; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs4, %f4;
; CHECK-SM70-NEXT: mov.b32 %r5, {%rs4, %rs3};
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-SM70-NEXT: ret;
  %1 = fmul <2 x half> %a, %b
  %2 = fadd <2 x half> %1, %c
  %3 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %2, <2 x half> <half 0.0, half 0.0>)
  ret <2 x half> %3
}

; <2 x bfloat>, attribute group #1 (unsafe-fp-math only, NaNs not excluded):
; fmul + fadd followed by an `fcmp ogt` / `select` against zero. None of the
; expectations below contain a relu instruction: both sm_80 configurations
; expect fma.rn.bf16x2 followed by a separate max.bf16x2 against a zero
; register, and sm_70 expects a per-lane fma.rn.f32 plus setp.gt.f32 / selp.b16.
define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #1 {
; CHECK-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: mov.b32 %r5, 0;
; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5;
; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b32 %r<7>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1];
; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: mov.b32 %r5, 0;
; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5;
; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-NEXT: .reg .f32 %f<11>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1];
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 %f1, %r5;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 %f2, %r7;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
; CHECK-SM70-NEXT: mov.b32 %f3, %r9;
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
; CHECK-SM70-NEXT: mov.b32 %r10, %f4;
; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
; CHECK-SM70-NEXT: mov.b32 %f5, %r17;
; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
; CHECK-SM70-NEXT: mov.b32 %f6, %r19;
; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
; CHECK-SM70-NEXT: mov.b32 %f7, %r21;
; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5;
; CHECK-SM70-NEXT: mov.b32 %r22, %f8;
; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8;
; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
; CHECK-SM70-NEXT: mov.b32 %f9, %r28;
; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000;
; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
; CHECK-SM70-NEXT: mov.b32 %f10, %r29;
; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
  %1 = fmul <2 x bfloat> %a, %b
  %2 = fadd <2 x bfloat> %1, %c
  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
  ret <2 x bfloat> %4
}

; Same IR body as the function above, but with attribute group #0 (no-NaNs,
; no-signed-zeros, unsafe-fp-math). Both sm_80 configurations now expect a
; single fma.rn.relu.bf16x2 (there is no .ftz form in the preserve-sign
; expectations); the sm_70 expectations are still the f32 expansion with
; setp.gt.f32 / selp.b16.
define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: fma_bf16x2_expanded_no_nans(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0];
; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b32 %r<5>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2];
; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1];
; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0];
; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-NEXT: .reg .f32 %f<11>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0];
; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1];
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 %f1, %r5;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 %f2, %r7;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
; CHECK-SM70-NEXT: mov.b32 %f3, %r9;
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
; CHECK-SM70-NEXT: mov.b32 %r10, %f4;
; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
; CHECK-SM70-NEXT: mov.b32 %f5, %r17;
; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
; CHECK-SM70-NEXT: mov.b32 %f6, %r19;
; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
; CHECK-SM70-NEXT: mov.b32 %f7, %r21;
; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5;
; CHECK-SM70-NEXT: mov.b32 %r22, %f8;
; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8;
; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
; CHECK-SM70-NEXT: mov.b32 %f9, %r28;
; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000;
; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
; CHECK-SM70-NEXT: mov.b32 %f10, %r29;
; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
  %1 = fmul <2 x bfloat> %a, %b
  %2 = fadd <2 x bfloat> %1, %c
  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
  ret <2 x bfloat> %4
}

; FMA relu shouldn't be selected if the FMA operation has multiple uses
; Here the fadd result %2 feeds both the compare/select clamp and a second
; fadd with 7.0, so every set of expectations keeps a plain fma followed by a
; separate clamp (max.bf16x2 on sm_80, setp.gt.f32 / selp.b16 on sm_70).
define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: mov.b32 %r5, 0;
; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5;
; CHECK-NEXT: mov.b32 %r7, 1065369472;
; CHECK-NEXT: mov.b32 %r8, 1088438496;
; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8;
; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<7>;
; CHECK-FTZ-NEXT: .reg .b32 %r<20>;
; CHECK-FTZ-NEXT: .reg .f32 %f<11>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: mov.b32 %r5, 0;
; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5;
; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs2;
; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16;
; CHECK-FTZ-NEXT: mov.b32 %f1, %r8;
; CHECK-FTZ-NEXT: add.ftz.f32 %f2, %f1, 0f40E00000;
; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1;
; CHECK-FTZ-NEXT: shl.b32 %r10, %r9, 16;
; CHECK-FTZ-NEXT: mov.b32 %f3, %r10;
; CHECK-FTZ-NEXT: add.ftz.f32 %f4, %f3, 0f40E00000;
; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs5;
; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16;
; CHECK-FTZ-NEXT: mov.b32 %f5, %r12;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4;
; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16;
; CHECK-FTZ-NEXT: mov.b32 %f6, %r14;
; CHECK-FTZ-NEXT: add.ftz.f32 %f7, %f5, %f6;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs6;
; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16;
; CHECK-FTZ-NEXT: mov.b32 %f8, %r16;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3;
; CHECK-FTZ-NEXT: shl.b32 %r18, %r17, 16;
; CHECK-FTZ-NEXT: mov.b32 %f9, %r18;
; CHECK-FTZ-NEXT: add.ftz.f32 %f10, %f8, %f9;
; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7;
; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r19;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<9>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
; CHECK-SM70-NEXT: .reg .b32 %r<61>;
; CHECK-SM70-NEXT: .reg .f32 %f<19>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2;
; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 %f1, %r5;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 %f2, %r7;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
; CHECK-SM70-NEXT: mov.b32 %f3, %r9;
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
; CHECK-SM70-NEXT: mov.b32 %r10, %f4;
; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1;
; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
; CHECK-SM70-NEXT: mov.b32 %f5, %r17;
; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3;
; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
; CHECK-SM70-NEXT: mov.b32 %f6, %r19;
; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5;
; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
; CHECK-SM70-NEXT: mov.b32 %f7, %r21;
; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5;
; CHECK-SM70-NEXT: mov.b32 %r22, %f8;
; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8;
; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
; CHECK-SM70-NEXT: mov.b32 %f9, %r28;
; CHECK-SM70-NEXT: setp.gt.f32 %p3, %f9, 0f00000000;
; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
; CHECK-SM70-NEXT: mov.b32 %f10, %r29;
; CHECK-SM70-NEXT: setp.gt.f32 %p4, %f10, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
; CHECK-SM70-NEXT: add.f32 %f11, %f10, 0f40E00000;
; CHECK-SM70-NEXT: mov.b32 %r30, %f11;
; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r30;
; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f11, %f11;
; CHECK-SM70-NEXT: or.b32 %r34, %r30, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p5;
; CHECK-SM70-NEXT: add.f32 %f12, %f9, 0f40E00000;
; CHECK-SM70-NEXT: mov.b32 %r36, %f12;
; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f12, %f12;
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6;
; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10;
; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16;
; CHECK-SM70-NEXT: mov.b32 %f13, %r43;
; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536;
; CHECK-SM70-NEXT: mov.b32 %f14, %r44;
; CHECK-SM70-NEXT: add.f32 %f15, %f13, %f14;
; CHECK-SM70-NEXT: mov.b32 %r45, %f15;
; CHECK-SM70-NEXT: bfe.u32 %r46, %r45, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r47, %r46, %r45;
; CHECK-SM70-NEXT: add.s32 %r48, %r47, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15;
; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7;
; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs9;
; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16;
; CHECK-SM70-NEXT: mov.b32 %f16, %r52;
; CHECK-SM70-NEXT: and.b32 %r53, %r35, -65536;
; CHECK-SM70-NEXT: mov.b32 %f17, %r53;
; CHECK-SM70-NEXT: add.f32 %f18, %f16, %f17;
; CHECK-SM70-NEXT: mov.b32 %r54, %f18;
; CHECK-SM70-NEXT: bfe.u32 %r55, %r54, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r56, %r55, %r54;
; CHECK-SM70-NEXT: add.s32 %r57, %r56, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p8, %f18, %f18;
; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8;
; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U;
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60;
; CHECK-SM70-NEXT: ret;
  %1 = fmul <2 x bfloat> %a, %b
  %2 = fadd <2 x bfloat> %1, %c
  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
  %5 = fadd <2 x bfloat> %2, <bfloat 7.0, bfloat 7.0>
  %6 = fadd <2 x bfloat> %4, %5
  ret <2 x bfloat> %6
}

; <2 x bfloat>, attribute group #0: fmul + fadd clamped with llvm.maxnum
; against a zero vector. Both sm_80 configurations expect a single
; fma.rn.relu.bf16x2; the sm_70 expectations use a per-lane fma.rn.f32
; followed by max.f32.
; NOTE(review): the `.bf16x2` intrinsic suffix is not the usual `.v2bf16`
; overload spelling; presumably it is resolved from the call signature -
; confirm.
define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
; CHECK-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b32 %r<5>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-FTZ-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
; CHECK-FTZ-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
; CHECK-FTZ-NEXT: fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
; CHECK-SM70-NEXT: ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 %f1, %r5;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 %f2, %r7;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
; CHECK-SM70-NEXT: mov.b32 %f3, %r9;
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
; CHECK-SM70-NEXT: mov.b32 %r10, %f4;
; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
; CHECK-SM70-NEXT: mov.b32 %f5, %r17;
; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
; CHECK-SM70-NEXT: mov.b32 %f6, %r19;
; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
; CHECK-SM70-NEXT: mov.b32 %f7, %r21;
; CHECK-SM70-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5;
; CHECK-SM70-NEXT: mov.b32 %r22, %f8;
; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f8, %f8;
; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
; CHECK-SM70-NEXT: mov.b32 %f9, %r28;
; CHECK-SM70-NEXT: max.f32 %f10, %f9, 0f00000000;
; CHECK-SM70-NEXT: mov.b32 %r29, %f10;
; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
; CHECK-SM70-NEXT: mov.b32 %r36, %f12;
; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
  %1 = fmul <2 x bfloat> %a, %b
  %2 = fadd <2 x bfloat> %1, %c
  %3 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
  ret <2 x bfloat> %3
}

; Function attribute groups used by the tests in this file:
;   #0 sets no-nans-fp-math, no-signed-zeros-fp-math and unsafe-fp-math.
;   #1 sets unsafe-fp-math only, so NaNs are not assumed absent.
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
attributes #1 = { "unsafe-fp-math"="true" }