; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve -o - | FileCheck %s

; Codegen test for complex-number reductions on AArch64 SVE.
;
; Every function below is an already-vectorized loop over scalable vectors.
; The loops load interleaved (re, im) data as <vscale x 4 x double>, split it
; with llvm.vector.deinterleave2 and accumulate into <vscale x 2 x double>
; reduction phis. The expected assembly pins the code llc emits for each loop.
; The assertion lines are generated; regenerate them with the script named on
; the first line instead of editing them by hand.
;
; None of the functions contains a scalar remainder loop: only the first
; %n.vec elements (the trip count rounded down to a multiple of the vector
; step) are processed.

target triple = "aarch64"

%"class.std::complex" = type { { double, double } }

; Zero initialized reduction
;
;   complex<double> x = 0.0 + 0.0i;
;   for (int i = 0; i < 100; ++i)
;       x += a[i] * b[i];
;
; %vec.phi12 accumulates the real parts and %vec.phi the imaginary parts; both
; start at zero. The expected loop body is two pairs of fcmla (#0 and #90)
; operating directly on the interleaved loads, with the uzp1/uzp2 split done
; once in the exit block before the two faddv reductions.
;
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    mov w10, #100 // =0x64
; CHECK-NEXT:    neg x9, x8
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    and x9, x9, x10
; CHECK-NEXT:    rdvl x10, #2
; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT:    subs x9, x9, x8
; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x1]
; CHECK-NEXT:    add x1, x1, x10
; CHECK-NEXT:    add x0, x0, x10
; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z2.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  ; %1 = 2 * vscale: complex elements consumed per iteration.
  ; %n.vec = 100 rounded down to a multiple of %1.
  ; %2 = 32 * vscale: byte size of one <vscale x 4 x double> load.
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 5
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv31 counts the remaining elements down to zero; %lsr.iv27 is the
  ; running byte offset into both %a and %b.
  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %14, %vector.body ]
  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  ; a: %4 = even lanes (real), %5 = odd lanes (imaginary).
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
  ; b: %7 = even lanes (real), %8 = odd lanes (imaginary).
  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  ; real += b.re * a.re - b.im * a.im   (%14)
  ; imag += b.re * a.im + b.im * a.re   (%16)
  %9 = fmul fast <vscale x 2 x double> %8, %4
  %10 = fmul fast <vscale x 2 x double> %7, %5
  %11 = fmul fast <vscale x 2 x double> %7, %4
  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
  %13 = fmul fast <vscale x 2 x double> %8, %5
  %14 = fsub fast <vscale x 2 x double> %12, %13
  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
  %16 = fadd fast <vscale x 2 x double> %15, %9
  %lsr.iv.next28 = add i64 %lsr.iv27, %2
  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
  %17 = icmp eq i64 %lsr.iv.next32, 0
  br i1 %17, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  ; Horizontal sums: %18 is the real part, %19 the imaginary part.
  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Fixed value initialized reduction
;
;   complex<double> x = 2.0 + 1.0i;
;   for (int i = 0; i < 100; ++i)
;       x += a[i] * b[i];
;
; Same loop as @complex_mul_v2f64, except that lane 0 of the real accumulator
; (%vec.phi12) starts at 2.0 and lane 0 of the imaginary accumulator (%vec.phi)
; starts at 1.0. The expected preheader builds both start vectors and
; interleaves them with zip1/zip2 before entering the fcmla loop.
;
define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fmov d0, #1.00000000
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    fmov d2, #2.00000000
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    neg x9, x8
; CHECK-NEXT:    mov w10, #100 // =0x64
; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
; CHECK-NEXT:    and x9, x9, x10
; CHECK-NEXT:    rdvl x10, #2
; CHECK-NEXT:    mov z1.d, p0/m, z2.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    zip2 z0.d, z1.d, z3.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z3.d
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT:    subs x9, x9, x8
; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x1]
; CHECK-NEXT:    add x1, x1, x10
; CHECK-NEXT:    add x0, x0, x10
; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z2.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  ; %1 = 2 * vscale: complex elements consumed per iteration.
  ; %n.vec = 100 rounded down to a multiple of %1.
  ; %2 = 32 * vscale: byte size of one <vscale x 4 x double> load.
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 5
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  ; Only lane 0 carries the initial value; all other lanes start at zero.
  %vec.phi = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 1.000000e+00, i32 0), %entry ], [ %16, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 2.000000e+00, i32 0), %entry ], [ %14, %vector.body ]
  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  ; a: %4 = even lanes (real), %5 = odd lanes (imaginary).
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
  ; b: %7 = even lanes (real), %8 = odd lanes (imaginary).
  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  ; real += b.re * a.re - b.im * a.im   (%14)
  ; imag += b.re * a.im + b.im * a.re   (%16)
  %9 = fmul fast <vscale x 2 x double> %8, %4
  %10 = fmul fast <vscale x 2 x double> %7, %5
  %11 = fmul fast <vscale x 2 x double> %7, %4
  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
  %13 = fmul fast <vscale x 2 x double> %8, %5
  %14 = fsub fast <vscale x 2 x double> %12, %13
  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
  %16 = fadd fast <vscale x 2 x double> %15, %9
  %lsr.iv.next28 = add i64 %lsr.iv27, %2
  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
  %17 = icmp eq i64 %lsr.iv.next32, 0
  br i1 %17, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  ; Horizontal sums: %18 is the real part, %19 the imaginary part.
  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Loop unrolled with factor 2
;
; The same multiply-accumulate as @complex_mul_v2f64, over 1000 elements, with
; two <vscale x 4 x double> loads per input per iteration and therefore two
; real accumulators (%vec.phi13, %vec.phi14) and two imaginary accumulators
; (%vec.phi, %vec.phi12). The partial accumulators are added together in the
; exit block before the horizontal reductions.
;
define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64_unrolled:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    mov w10, #1000 // =0x3e8
; CHECK-NEXT:    neg x9, x8
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    and x9, x9, x10
; CHECK-NEXT:    rdvl x10, #4
; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    mov z3.d, z0.d
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x0]
; CHECK-NEXT:    subs x9, x9, x8
; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1d { z16.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    add x0, x0, x10
; CHECK-NEXT:    ld1d { z18.d }, p0/z, [x1, #3, mul vl]
; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x1, #2, mul vl]
; CHECK-NEXT:    add x1, x1, x10
; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z5.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #0
; CHECK-NEXT:    fcmla z3.d, p0/m, z18.d, z6.d, #0
; CHECK-NEXT:    fcmla z2.d, p0/m, z19.d, z17.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z5.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #90
; CHECK-NEXT:    fcmla z3.d, p0/m, z18.d, z6.d, #90
; CHECK-NEXT:    fcmla z2.d, p0/m, z19.d, z17.d, #90
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
; CHECK-NEXT:    uzp1 z5.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
; CHECK-NEXT:    uzp2 z0.d, z1.d, z0.d
; CHECK-NEXT:    fadd z1.d, z4.d, z5.d
; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z1.d
; CHECK-NEXT:    faddv d1, p0, z2.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  ; %1 = 4 * vscale: complex elements consumed per (unrolled) iteration.
  ; %n.vec = 1000 rounded down to a multiple of %1.
  ; %2 = 64 * vscale: bytes consumed per iteration from each input.
  ; %3 = 32 * vscale: byte offset of the second unrolled part, used to
  ;      pre-compute the second-part base pointers %scevgep61 / %scevgep63.
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 2
  %n.mod.vf = urem i64 1000, %1
  %n.vec = sub i64 1000, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 6
  %3 = shl nuw nsw i64 %0, 5
  %scevgep61 = getelementptr i8, ptr %b, i64 %3
  %scevgep63 = getelementptr i8, ptr %a, i64 %3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv38 = phi i64 [ %lsr.iv.next39, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv34 = phi i64 [ %lsr.iv.next35, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %30, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %31, %vector.body ]
  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %26, %vector.body ]
  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %27, %vector.body ]
  %scevgep57 = getelementptr i8, ptr %a, i64 %lsr.iv34
  %scevgep64 = getelementptr i8, ptr %scevgep63, i64 %lsr.iv34
  %scevgep58 = getelementptr i8, ptr %b, i64 %lsr.iv34
  %scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
  ; a, both unrolled parts: %6/%7 = real lanes, %8/%9 = imaginary lanes.
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
  %wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
  %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
  ; b, both unrolled parts: %12/%13 = real lanes, %14/%15 = imaginary lanes.
  %wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
  %wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
  %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
  %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
  %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
  %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
  ; Per part: real += b.re * a.re - b.im * a.im   (%26, %27)
  ;           imag += b.re * a.im + b.im * a.re   (%30, %31)
  %16 = fmul fast <vscale x 2 x double> %14, %6
  %17 = fmul fast <vscale x 2 x double> %15, %7
  %18 = fmul fast <vscale x 2 x double> %12, %8
  %19 = fmul fast <vscale x 2 x double> %13, %9
  %20 = fmul fast <vscale x 2 x double> %12, %6
  %21 = fmul fast <vscale x 2 x double> %13, %7
  %22 = fadd fast <vscale x 2 x double> %20, %vec.phi13
  %23 = fadd fast <vscale x 2 x double> %21, %vec.phi14
  %24 = fmul fast <vscale x 2 x double> %14, %8
  %25 = fmul fast <vscale x 2 x double> %15, %9
  %26 = fsub fast <vscale x 2 x double> %22, %24
  %27 = fsub fast <vscale x 2 x double> %23, %25
  %28 = fadd fast <vscale x 2 x double> %18, %vec.phi
  %29 = fadd fast <vscale x 2 x double> %19, %vec.phi12
  %30 = fadd fast <vscale x 2 x double> %28, %16
  %31 = fadd fast <vscale x 2 x double> %29, %17
  %lsr.iv.next35 = add i64 %lsr.iv34, %2
  %lsr.iv.next39 = sub i64 %lsr.iv38, %1
  %32 = icmp eq i64 %lsr.iv.next39, 0
  br i1 %32, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  ; Combine the two partial accumulators of each component, then reduce:
  ; %33 is the real part, %34 the imaginary part.
  %bin.rdx15 = fadd fast <vscale x 2 x double> %27, %26
  %33 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx15)
  %bin.rdx = fadd fast <vscale x 2 x double> %31, %30
  %34 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %33, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %34, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Integer and floating point complex number reduction in the same loop:
;   complex<double> *a = ...;
;   int *s = ...;
;
;   for (int i = 0; i < 100; ++i) {
;     sum += a[i];
;     int_sum += s[i];
;   }
;
; The complex sum is returned and the integer sum is stored through %outs.
; %b, %c and %d.coerce are not used by the body.
;
; As written, field 0 of the result is the reduction of the odd-lane
; accumulator (%10) and field 1 that of the even-lane accumulator (%9); the
; expected assembly pins that ordering (uzp2 feeds d0, uzp1 feeds d1).
;
; NOTE(review): attribute group #0 is referenced below but is not defined in
; the visible part of this file - confirm it is intentional.
;
define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 {
; CHECK-LABEL: reduction_mix:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.d, #0 // =0x0
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    mov w11, #100 // =0x64
; CHECK-NEXT:    neg x10, x9
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x10, x11
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    zip2 z0.d, z2.d, z2.d
; CHECK-NEXT:    zip1 z1.d, z2.d, z2.d
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    add x0, x0, x11
; CHECK-NEXT:    ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
; CHECK-NEXT:    add x8, x8, x9
; CHECK-NEXT:    cmp x10, x8
; CHECK-NEXT:    fadd z0.d, z4.d, z0.d
; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
; CHECK-NEXT:    add z2.d, z5.d, z2.d
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %middle.block
; CHECK-NEXT:    uaddv d2, p0, z2.d
; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
; CHECK-NEXT:    uzp1 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z3.d
; CHECK-NEXT:    fmov x8, d2
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    str w8, [x4]
; CHECK-NEXT:    ret
entry:
  ; %1 and %3 are both 2 * vscale (elements per iteration); %1 is used to
  ; compute the vector trip count %n.vec, %3 is the induction step.
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = tail call i64 @llvm.vscale.i64()
  %3 = shl nuw nsw i64 %2, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <vscale x 2 x i32> [ zeroinitializer, %entry ], [ %5, %vector.body ]
  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %9, %vector.body ]
  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  ; Integer reduction over %s.
  %4 = getelementptr inbounds i32, ptr %s, i64 %index
  %wide.load = load <vscale x 2 x i32>, ptr %4, align 4
  %5 = add <vscale x 2 x i32> %wide.load, %vec.phi
  ; Complex reduction over %a: %7 = even lanes, %8 = odd lanes.
  %6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
  %wide.vec = load <vscale x 4 x double>, ptr %6, align 8
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
  %10 = fadd fast <vscale x 2 x double> %8, %vec.phi14
  %index.next = add nuw i64 %index, %3
  %11 = icmp eq i64 %index.next, %n.vec
  br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %12 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %10)
  %13 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %9)
  %14 = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %5)
  store i32 %14, ptr %outs, align 4
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %12, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %13, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}


; Intrinsics used by the functions above.
declare i64 @llvm.vscale.i64()
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)