; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s

; Code generation tests for loops that accumulate a[i] * b[i] over arrays of
; interleaved (real, imaginary) double pairs. In the expected assembly each
; multiply-accumulate is an fcmla #0 / fcmla #90 pair on the loaded vectors,
; and the real and imaginary lanes are only separated (zip1/zip2) and summed
; (faddp) after the loop.

target triple = "aarch64"

%"struct.std::complex" = type { { double, double } }

; Zero initialized reduction
;
;   complex<double> x = 0.0 + 0.0i;
;   for (int i = 0; i < 100; ++i)
;       x += a[i] * b[i];
;
; The IR below is the vectorized form of that loop: each trip loads one
; <4 x double> (two complex values) from %a and from %b, splits them into
; even (real) and odd (imaginary) lanes, and updates one real and one
; imaginary <2 x double> accumulator.
;
define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x10, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    ldp q3, q2, [x9]
; CHECK-NEXT:    cmp x8, #1600
; CHECK-NEXT:    ldp q5, q4, [x10]
; CHECK-NEXT:    fcmla v0.2d, v5.2d, v3.2d, #0
; CHECK-NEXT:    fcmla v1.2d, v4.2d, v2.2d, #0
; CHECK-NEXT:    fcmla v0.2d, v5.2d, v3.2d, #90
; CHECK-NEXT:    fcmla v1.2d, v4.2d, v2.2d, #90
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %middle.block
; CHECK-NEXT:    zip2 v2.2d, v0.2d, v1.2d
; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    faddp d1, v2.2d
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv is a byte offset applied to both %a and %b; it advances by 32
  ; bytes (one <4 x double>) per trip and the loop exits at 1600 bytes.
  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
  ; %vec.phi carries the imaginary sums (%7), %vec.phi27 the real sums (%5).
  %vec.phi = phi <2 x double> [ zeroinitializer, %entry ], [ %7, %vector.body ]
  %vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %5, %vector.body ]
  %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
  %scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
  ; Deinterleave %a: lanes 0,2 -> %strided.vec, lanes 1,3 -> %strided.vec28.
  %wide.vec = load <4 x double>, ptr %scevgep, align 8
  %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Deinterleave %b: lanes 0,2 -> %strided.vec30, lanes 1,3 -> %strided.vec31.
  %wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
  %strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Real part:      %5 = %vec.phi27 + even(b)*even(a) - odd(b)*odd(a)
  ; Imaginary part: %7 = %vec.phi   + even(b)*odd(a)  + odd(b)*even(a)
  %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
  %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
  %2 = fmul fast <2 x double> %strided.vec30, %strided.vec
  %3 = fadd fast <2 x double> %2, %vec.phi27
  %4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
  %5 = fsub fast <2 x double> %3, %4
  %6 = fadd fast <2 x double> %1, %vec.phi
  %7 = fadd fast <2 x double> %6, %0
  %lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
  %8 = icmp eq i64 %lsr.iv.next, 1600
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Horizontal sums of the two accumulators become fields {0,0} and {0,1} of
  ; the returned struct.
  %9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
  %10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
  %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
  %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
  ret %"struct.std::complex" %.fca.0.1.insert
}

; Fixed value initialized reduction
;
;   complex<double> x = 2.0 + 1.0i;
;   for (int i = 0; i < 100; ++i)
;       x += a[i] * b[i];
;
; The loop body is the same as in the zero-initialized test; only the entry
; values of the two accumulator phis differ. The expected assembly loads one
; accumulator register from a constant-pool entry (.LCPI1_0) and zeroes the
; other.
;
define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    adrp x8, .LCPI1_0
; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x10, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    ldp q3, q2, [x9]
; CHECK-NEXT:    cmp x8, #1600
; CHECK-NEXT:    ldp q5, q4, [x10]
; CHECK-NEXT:    fcmla v1.2d, v5.2d, v3.2d, #0
; CHECK-NEXT:    fcmla v0.2d, v4.2d, v2.2d, #0
; CHECK-NEXT:    fcmla v1.2d, v5.2d, v3.2d, #90
; CHECK-NEXT:    fcmla v0.2d, v4.2d, v2.2d, #90
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %middle.block
; CHECK-NEXT:    zip2 v2.2d, v1.2d, v0.2d
; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    faddp d1, v2.2d
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  ; Byte offset shared by %a and %b: +32 per trip, exit at 1600.
  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
  ; Imaginary accumulator starts at <1.0, 0.0> and the real accumulator at
  ; <2.0, 0.0>, so their horizontal sums start at 1.0 and 2.0 respectively.
  %vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %7, %vector.body ]
  %vec.phi27 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %5, %vector.body ]
  %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
  %scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
  ; Deinterleave %a: lanes 0,2 -> %strided.vec, lanes 1,3 -> %strided.vec28.
  %wide.vec = load <4 x double>, ptr %scevgep, align 8
  %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Deinterleave %b: lanes 0,2 -> %strided.vec30, lanes 1,3 -> %strided.vec31.
  %wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
  %strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Real part:      %5 = %vec.phi27 + even(b)*even(a) - odd(b)*odd(a)
  ; Imaginary part: %7 = %vec.phi   + even(b)*odd(a)  + odd(b)*even(a)
  %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
  %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
  %2 = fmul fast <2 x double> %strided.vec30, %strided.vec
  %3 = fadd fast <2 x double> %2, %vec.phi27
  %4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
  %5 = fsub fast <2 x double> %3, %4
  %6 = fadd fast <2 x double> %1, %vec.phi
  %7 = fadd fast <2 x double> %6, %0
  %lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
  %8 = icmp eq i64 %lsr.iv.next, 1600
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Real sum goes to field {0,0}, imaginary sum to field {0,1}.
  %9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
  %10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
  %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
  %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
  ret %"struct.std::complex" %.fca.0.1.insert
}

; Loop unrolled with factor 2
;
; Each trip loads two <4 x double> vectors from each array (at offsets -32 and
; 0 from pointers that advance by 64 bytes) and updates two real and two
; imaginary accumulators. The trip counter starts at 100 and decreases by 4.
; The first real/imaginary accumulator pair starts at <2.0, 0.0> / <1.0, 0.0>;
; the second pair starts at zero. The accumulator pairs are added together in
; %middle.block before the horizontal reduction.
;
define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64_unrolled:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    adrp x8, .LCPI2_0
; CHECK-NEXT:    movi v3.2d, #0000000000000000
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT:    add x8, x0, #32
; CHECK-NEXT:    add x9, x1, #32
; CHECK-NEXT:    mov x10, #-100 // =0xffffffffffffff9c
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp q5, q4, [x8, #-32]
; CHECK-NEXT:    adds x10, x10, #4
; CHECK-NEXT:    ldp q7, q6, [x9, #-32]
; CHECK-NEXT:    ldp q17, q16, [x8], #64
; CHECK-NEXT:    ldp q19, q18, [x9], #64
; CHECK-NEXT:    fcmla v2.2d, v7.2d, v5.2d, #0
; CHECK-NEXT:    fcmla v0.2d, v6.2d, v4.2d, #0
; CHECK-NEXT:    fcmla v1.2d, v19.2d, v17.2d, #0
; CHECK-NEXT:    fcmla v3.2d, v18.2d, v16.2d, #0
; CHECK-NEXT:    fcmla v2.2d, v7.2d, v5.2d, #90
; CHECK-NEXT:    fcmla v0.2d, v6.2d, v4.2d, #90
; CHECK-NEXT:    fcmla v1.2d, v19.2d, v17.2d, #90
; CHECK-NEXT:    fcmla v3.2d, v18.2d, v16.2d, #90
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %middle.block
; CHECK-NEXT:    zip2 v4.2d, v1.2d, v3.2d
; CHECK-NEXT:    zip1 v1.2d, v1.2d, v3.2d
; CHECK-NEXT:    zip2 v3.2d, v2.2d, v0.2d
; CHECK-NEXT:    zip1 v0.2d, v2.2d, v0.2d
; CHECK-NEXT:    fadd v0.2d, v1.2d, v0.2d
; CHECK-NEXT:    fadd v1.2d, v4.2d, v3.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    faddp d1, v1.2d
; CHECK-NEXT:    ret
entry:
  ; Both running pointers start 32 bytes past the array base so that the two
  ; loads per array in the loop use offsets -32 and 0.
  %scevgep = getelementptr i8, ptr %a, i64 32
  %scevgep49 = getelementptr i8, ptr %b, i64 32
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  ; Down-counter: 100 on entry, -4 per trip, exit at 0.
  %lsr.iv54 = phi i64 [ %lsr.iv.next, %vector.body ], [ 100, %entry ]
  ; Running pointers into %b (%lsr.iv50) and %a (%lsr.iv), +64 bytes per trip.
  %lsr.iv50 = phi ptr [ %scevgep51, %vector.body ], [ %scevgep49, %entry ]
  %lsr.iv = phi ptr [ %scevgep48, %vector.body ], [ %scevgep, %entry ]
  ; %vec.phi / %vec.phi27 carry the imaginary sums (%14 / %15);
  ; %vec.phi28 / %vec.phi29 carry the real sums (%10 / %11).
  %vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %14, %vector.body ]
  %vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %15, %vector.body ]
  %vec.phi28 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %10, %vector.body ]
  %vec.phi29 = phi <2 x double> [ zeroinitializer, %entry ], [ %11, %vector.body ]
  %scevgep52 = getelementptr i8, ptr %lsr.iv, i64 -32
  %scevgep53 = getelementptr i8, ptr %lsr.iv50, i64 -32
  ; Two vectors from %a, each split into even lanes (0,2) and odd lanes (1,3).
  %wide.vec = load <4 x double>, ptr %scevgep52, align 8
  %wide.vec30 = load <4 x double>, ptr %lsr.iv, align 8
  %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec31 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec32 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec33 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Two vectors from %b, split the same way.
  %wide.vec34 = load <4 x double>, ptr %scevgep53, align 8
  %wide.vec35 = load <4 x double>, ptr %lsr.iv50, align 8
  %strided.vec36 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec37 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec39 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Same multiply-accumulate as the non-unrolled tests, done twice:
  ; real sums are acc + even(b)*even(a) - odd(b)*odd(a)  -> %10, %11
  ; imaginary sums are acc + even(b)*odd(a) + odd(b)*even(a) -> %14, %15
  %0 = fmul fast <2 x double> %strided.vec38, %strided.vec
  %1 = fmul fast <2 x double> %strided.vec39, %strided.vec31
  %2 = fmul fast <2 x double> %strided.vec36, %strided.vec32
  %3 = fmul fast <2 x double> %strided.vec37, %strided.vec33
  %4 = fmul fast <2 x double> %strided.vec36, %strided.vec
  %5 = fmul fast <2 x double> %strided.vec37, %strided.vec31
  %6 = fadd fast <2 x double> %4, %vec.phi28
  %7 = fadd fast <2 x double> %5, %vec.phi29
  %8 = fmul fast <2 x double> %strided.vec38, %strided.vec32
  %9 = fmul fast <2 x double> %strided.vec39, %strided.vec33
  %10 = fsub fast <2 x double> %6, %8
  %11 = fsub fast <2 x double> %7, %9
  %12 = fadd fast <2 x double> %2, %vec.phi
  %13 = fadd fast <2 x double> %3, %vec.phi27
  %14 = fadd fast <2 x double> %12, %0
  %15 = fadd fast <2 x double> %13, %1
  %scevgep48 = getelementptr i8, ptr %lsr.iv, i64 64
  %scevgep51 = getelementptr i8, ptr %lsr.iv50, i64 64
  %lsr.iv.next = add nsw i64 %lsr.iv54, -4
  %16 = icmp eq i64 %lsr.iv.next, 0
  br i1 %16, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Combine the two real accumulators, then the two imaginary accumulators,
  ; before reducing each to a scalar.
  %bin.rdx40 = fadd fast <2 x double> %11, %10
  %17 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx40)
  %bin.rdx = fadd fast <2 x double> %15, %14
  %18 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx)
  %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %17, 0, 0
  %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %18, 0, 1
  ret %"struct.std::complex" %.fca.0.1.insert
}

; The reduced bug from D153355. Shows that reduction was detected where it did not exist.
;
; The two loop-carried phis are fed by splat shuffles of constant vectors, not
; by a complex multiply-accumulate. The function returns void and none of the
; vector values is stored, so the expected assembly is only the loop branch on
; bit 0 of the argument followed by ret.
define void @incorrect_reduction_pattern(i1 %exitcond.not) {
; CHECK-LABEL: incorrect_reduction_pattern:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:  .LBB3_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    tbz w0, #0, .LBB3_1
; CHECK-NEXT:  // %bb.2: // %for.end.loopexit
; CHECK-NEXT:    ret
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %vec_r = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_r, %for.body ]
  %vec_i = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_i, %for.body ]
  %add = fadd <4 x float> %vec_r, %vec_i
  ; An all-zero shuffle mask selects lane 0 of the first operand for every
  ; result lane, so the undef lanes of the constant are never read.
  %lane_r = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
  %lane_i = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
  br i1 %exitcond.not, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  ; These results are intentionally unused; they only provide users of the
  ; loop values outside the loop.
  %mul.r = fadd <4 x float> %lane_r, %add
  %mul.i = fadd <4 x float> %lane_i, %add
  ret void
}

declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)