; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -fp-contract=fast -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST
; Check latencies of vmul/vfma accumulate chains.
; The first RUN line checks the default (VMLA/VMLS) forms; the second checks the
; fused (VFMA/VFMS) forms selected under -fp-contract=fast.

define arm_aapcs_vfpcc float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test1:%bb.0

; CHECK: VMULS
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: Data
; > VMULS read-advanced latency to VMLAS = 0
; CHECK-SAME: Latency=0

; CHECK-DEFAULT: VMLAS
; CHECK-FAST: VFMAS
; > VMLAS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLAS read-advanced latency to the next VMLAS = 4
; CHECK-SAME: Latency=4

; CHECK-DEFAULT: VMLAS
; CHECK-FAST: VFMAS
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9

; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS
  %mul1 = fmul float %f1, %f2
  %mul2 = fmul float %f3, %f4
  %mul3 = fmul float %f5, %f6
  %add1 = fadd float %mul1, %mul2
  %add2 = fadd float %add1, %mul3
  ret float %add2
}

; ASIMD form
define arm_aapcs_vfpcc <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test2:%bb.0

; CHECK: VMULfd
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: Data
; > VMULfd read-advanced latency to VMLAfd = 0
; CHECK-SAME: Latency=0

; CHECK-DEFAULT: VMLAfd
; CHECK-FAST: VFMAfd
; > VMLAfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLAfd read-advanced latency to the next VMLAfd = 4
; CHECK-SAME: Latency=4

; CHECK-DEFAULT: VMLAfd
; CHECK-FAST: VFMAfd
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLAfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9

; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULfd, VMLAfd, VMLAfd
  %mul1 = fmul <2 x float> %f1, %f2
  %mul2 = fmul <2 x float> %f3, %f4
  %mul3 = fmul <2 x float> %f5, %f6
  %add1 = fadd <2 x float> %mul1, %mul2
  %add2 = fadd <2 x float> %add1, %mul3
  ret <2 x float> %add2
}

define arm_aapcs_vfpcc float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test3:%bb.0

; CHECK: VMULS
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: Data
; > VMULS read-advanced latency to VMLSS = 0
; CHECK-SAME: Latency=0

; CHECK-DEFAULT: VMLSS
; CHECK-FAST: VFNMSS
; > VMLSS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLSS read-advanced latency to the next VMLSS = 4
; CHECK-SAME: Latency=4

; CHECK-DEFAULT: VMLSS
; CHECK-FAST: VFMSS
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLSS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9

; f1 * f2 - f3 * f4 - f5 * f6 ==> VMULS, VMLSS, VMLSS
  %mul1 = fmul float %f1, %f2
  %mul2 = fmul float %f3, %f4
  %mul3 = fmul float %f5, %f6
  %sub1 = fsub float %mul1, %mul2
  %sub2 = fsub float %sub1, %mul3
  ret float %sub2
}

; ASIMD form
define arm_aapcs_vfpcc <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test4:%bb.0

; CHECK: VMULfd
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: Data
; > VMULfd read-advanced latency to VMLSfd = 0
; CHECK-SAME: Latency=0

; CHECK-DEFAULT: VMLSfd
; CHECK-FAST: VFMSfd
; > VMLSfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLSfd read-advanced latency to the next VMLSfd = 4
; CHECK-SAME: Latency=4

; CHECK-DEFAULT: VMLSfd
; CHECK-FAST: VFMSfd
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VMLSfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9

; f1 * f2 - f3 * f4 - f5 * f6 ==> VMULfd, VMLSfd, VMLSfd
  %mul1 = fmul <2 x float> %f1, %f2
  %mul2 = fmul <2 x float> %f3, %f4
  %mul3 = fmul <2 x float> %f5, %f6
  %sub1 = fsub <2 x float> %mul1, %mul2
  %sub2 = fsub <2 x float> %sub1, %mul3
  ret <2 x float> %sub2
}

define arm_aapcs_vfpcc float @Test5(float %f1, float %f2, float %f3) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test5:%bb.0

; CHECK-DEFAULT: VNMLS
; CHECK-FAST: VFNMS
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VNMLS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9

; f1 * f2 - f3 ==> VNMLS/VFNMS
  %mul = fmul float %f1, %f2
  %sub = fsub float %mul, %f3
  ret float %sub
}

define arm_aapcs_vfpcc float @Test6(float %f1, float %f2, float %f3) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test6:%bb.0

; CHECK-DEFAULT: VNMLA
; CHECK-FAST: VFNMA
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
; > VNMLA not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9

; -(f1 * f2) - f2 ==> VNMLA/VFNMA
  %mul = fmul float %f1, %f2
  %sub1 = fsub float -0.0, %mul
  %sub2 = fsub float %sub1, %f2
  ret float %sub2
}