xref: /llvm-project/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll (revision e16f2f5d2491fde19afb63d5cec83625d391be30)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
3
4target triple = "aarch64"
5
; Return type of the functions below: a struct wrapping a pair of doubles.
; The functions fill it with insertvalue at indices {0,0} and {0,1}.
6%"class.std::complex" = type { { double, double } }
7
8; Zero initialized reduction
9;
10;   complex<double> x = 0.0 + 0.0i;
11;   for (int i = 0; i < 100; ++i)
12;       x += a[i] * b[i];
13;
14define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
15; CHECK-LABEL: complex_mul_v2f64:
16; CHECK:       // %bb.0: // %entry
17; CHECK-NEXT:    mov z1.d, #0 // =0x0
18; CHECK-NEXT:    cntd x8
19; CHECK-NEXT:    mov w10, #100 // =0x64
20; CHECK-NEXT:    neg x9, x8
21; CHECK-NEXT:    ptrue p0.d
22; CHECK-NEXT:    and x9, x9, x10
23; CHECK-NEXT:    rdvl x10, #2
24; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
25; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
26; CHECK-NEXT:  .LBB0_1: // %vector.body
27; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
28; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
29; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
30; CHECK-NEXT:    subs x9, x9, x8
31; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x1, #1, mul vl]
32; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x1]
33; CHECK-NEXT:    add x1, x1, x10
34; CHECK-NEXT:    add x0, x0, x10
35; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #0
36; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #0
37; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #90
38; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #90
39; CHECK-NEXT:    b.ne .LBB0_1
40; CHECK-NEXT:  // %bb.2: // %exit.block
41; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
42; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
43; CHECK-NEXT:    faddv d0, p0, z2.d
44; CHECK-NEXT:    faddv d1, p0, z1.d
45; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
46; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
47; CHECK-NEXT:    ret
; Multiply-accumulate over two arrays loaded as <vscale x 4 x double> and
; split with deinterleave2, using two separate <vscale x 2 x double>
; accumulators. The CHECK lines above expect the loop body to become FCMLA
; #0/#90 pairs, with the accumulators interleaved (zip1/zip2) before the loop
; and deinterleaved again (uzp1/uzp2) after it.
48entry:
  ; %1 = 2 * vscale is the per-iteration decrement of the element counter;
  ; %n.vec rounds the trip count of 100 down to a multiple of %1.
49  %0 = tail call i64 @llvm.vscale.i64()
50  %1 = shl nuw nsw i64 %0, 1
51  %n.mod.vf = urem i64 100, %1
52  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  ; %2 = 32 * vscale is the byte offset added per iteration, i.e. the size of
  ; one <vscale x 4 x double> load.
53  %2 = shl nuw nsw i64 %0, 5
54  br label %vector.body
55
56vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv31 counts the remaining elements down to zero; %lsr.iv27 is the
  ; running byte offset applied to both %a and %b.
57  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
58  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  ; %vec.phi carries the imaginary-part accumulator (%16) and %vec.phi12 the
  ; real-part accumulator (%14); both start at zero.
59  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
60  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %14, %vector.body ]
61  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
62  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  ; Load one wide vector per input and split it into field 0 and field 1 of
  ; the deinterleave2 result: %4/%5 come from %a, %7/%8 from %b.
63  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
64  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
65  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
66  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
67  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
68  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
69  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
70  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  ; Taking field 0 as the real parts and field 1 as the imaginary parts:
  ;   %14 = %vec.phi12 + %7 * %4 - %8 * %5   (real)
  ;   %16 = %vec.phi   + %7 * %5 + %8 * %4   (imaginary)
71  %9 = fmul fast <vscale x 2 x double> %8, %4
72  %10 = fmul fast <vscale x 2 x double> %7, %5
73  %11 = fmul fast <vscale x 2 x double> %7, %4
74  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
75  %13 = fmul fast <vscale x 2 x double> %8, %5
76  %14 = fsub fast <vscale x 2 x double> %12, %13
77  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
78  %16 = fadd fast <vscale x 2 x double> %15, %9
  ; Advance the byte offset, decrement the counter and exit once it hits zero.
79  %lsr.iv.next28 = add i64 %lsr.iv27, %2
80  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
81  %17 = icmp eq i64 %lsr.iv.next32, 0
82  br i1 %17, label %exit.block, label %vector.body
83
84exit.block:                                     ; preds = %vector.body
  ; Horizontally reduce each accumulator and return the sums as fields {0,0}
  ; (from %14) and {0,1} (from %16). This function has no scalar remainder loop.
85  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
86  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
87  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
88  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
89  ret %"class.std::complex" %.fca.0.1.insert
90}
91
92; Fixed value initialized reduction
93;
94;   complex<double> x = 2.0 + 1.0i;
95;   for (int i = 0; i < 100; ++i)
96;       x += a[i] * b[i];
97;
98define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
99; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
100; CHECK:       // %bb.0: // %entry
101; CHECK-NEXT:    fmov d0, #1.00000000
102; CHECK-NEXT:    mov z1.d, #0 // =0x0
103; CHECK-NEXT:    cntd x8
104; CHECK-NEXT:    fmov d2, #2.00000000
105; CHECK-NEXT:    ptrue p0.d, vl1
106; CHECK-NEXT:    neg x9, x8
107; CHECK-NEXT:    mov w10, #100 // =0x64
108; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
109; CHECK-NEXT:    and x9, x9, x10
110; CHECK-NEXT:    rdvl x10, #2
111; CHECK-NEXT:    mov z1.d, p0/m, z2.d
112; CHECK-NEXT:    ptrue p0.d
113; CHECK-NEXT:    zip2 z0.d, z1.d, z3.d
114; CHECK-NEXT:    zip1 z1.d, z1.d, z3.d
115; CHECK-NEXT:  .LBB1_1: // %vector.body
116; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
117; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
118; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
119; CHECK-NEXT:    subs x9, x9, x8
120; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x1, #1, mul vl]
121; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x1]
122; CHECK-NEXT:    add x1, x1, x10
123; CHECK-NEXT:    add x0, x0, x10
124; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #0
125; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #0
126; CHECK-NEXT:    fcmla z1.d, p0/m, z5.d, z3.d, #90
127; CHECK-NEXT:    fcmla z0.d, p0/m, z4.d, z2.d, #90
128; CHECK-NEXT:    b.ne .LBB1_1
129; CHECK-NEXT:  // %bb.2: // %exit.block
130; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
131; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
132; CHECK-NEXT:    faddv d0, p0, z2.d
133; CHECK-NEXT:    faddv d1, p0, z1.d
134; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
135; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
136; CHECK-NEXT:    ret
; Same multiply-accumulate loop shape as a zero-initialised reduction, but the
; two accumulator phis start from vectors whose lane 0 is a non-zero constant.
; The CHECK lines above expect those initial vectors to be built in the entry
; block, interleaved with zip1/zip2, and the loop body to become FCMLA #0/#90
; pairs.
137entry:
  ; %1 = 2 * vscale is the per-iteration decrement of the element counter;
  ; %n.vec rounds the trip count of 100 down to a multiple of %1.
138  %0 = tail call i64 @llvm.vscale.i64()
139  %1 = shl nuw nsw i64 %0, 1
140  %n.mod.vf = urem i64 100, %1
141  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  ; %2 = 32 * vscale is the byte offset added per iteration, i.e. the size of
  ; one <vscale x 4 x double> load.
142  %2 = shl nuw nsw i64 %0, 5
143  br label %vector.body
144
145vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv31 counts the remaining elements down to zero; %lsr.iv27 is the
  ; running byte offset applied to both %a and %b.
146  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
147  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  ; %vec.phi carries the imaginary-part accumulator (%16), initialised to 1.0
  ; in lane 0; %vec.phi12 carries the real-part accumulator (%14), initialised
  ; to 2.0 in lane 0. All other lanes start at zero.
148  %vec.phi = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 1.000000e+00, i32 0), %entry ], [ %16, %vector.body ]
149  %vec.phi12 = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 2.000000e+0, i32 0), %entry ], [ %14, %vector.body ]
150  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
151  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  ; Load one wide vector per input and split it into field 0 and field 1 of
  ; the deinterleave2 result: %4/%5 come from %a, %7/%8 from %b.
152  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
153  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
154  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
155  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
156  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
157  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
158  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
159  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  ; Taking field 0 as the real parts and field 1 as the imaginary parts:
  ;   %14 = %vec.phi12 + %7 * %4 - %8 * %5   (real)
  ;   %16 = %vec.phi   + %7 * %5 + %8 * %4   (imaginary)
160  %9 = fmul fast <vscale x 2 x double> %8, %4
161  %10 = fmul fast <vscale x 2 x double> %7, %5
162  %11 = fmul fast <vscale x 2 x double> %7, %4
163  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
164  %13 = fmul fast <vscale x 2 x double> %8, %5
165  %14 = fsub fast <vscale x 2 x double> %12, %13
166  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
167  %16 = fadd fast <vscale x 2 x double> %15, %9
  ; Advance the byte offset, decrement the counter and exit once it hits zero.
168  %lsr.iv.next28 = add i64 %lsr.iv27, %2
169  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
170  %17 = icmp eq i64 %lsr.iv.next32, 0
171  br i1 %17, label %exit.block, label %vector.body
172
173exit.block:                                     ; preds = %vector.body
  ; Horizontally reduce each accumulator and return the sums as fields {0,0}
  ; (from %14) and {0,1} (from %16). This function has no scalar remainder loop.
174  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
175  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
176  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
177  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
178  ret %"class.std::complex" %.fca.0.1.insert
179}
180
181; Loop unrolled with factor 2
182;
183define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
184; CHECK-LABEL: complex_mul_v2f64_unrolled:
185; CHECK:       // %bb.0: // %entry
186; CHECK-NEXT:    mov z1.d, #0 // =0x0
187; CHECK-NEXT:    cntw x8
188; CHECK-NEXT:    mov w10, #1000 // =0x3e8
189; CHECK-NEXT:    neg x9, x8
190; CHECK-NEXT:    ptrue p0.d
191; CHECK-NEXT:    and x9, x9, x10
192; CHECK-NEXT:    rdvl x10, #4
193; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
194; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
195; CHECK-NEXT:    mov z2.d, z1.d
196; CHECK-NEXT:    mov z3.d, z0.d
197; CHECK-NEXT:  .LBB2_1: // %vector.body
198; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
199; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #1, mul vl]
200; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x0]
201; CHECK-NEXT:    subs x9, x9, x8
202; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x0, #3, mul vl]
203; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x1, #1, mul vl]
204; CHECK-NEXT:    ld1d { z16.d }, p0/z, [x1]
205; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x0, #2, mul vl]
206; CHECK-NEXT:    add x0, x0, x10
207; CHECK-NEXT:    ld1d { z18.d }, p0/z, [x1, #3, mul vl]
208; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x1, #2, mul vl]
209; CHECK-NEXT:    add x1, x1, x10
210; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z5.d, #0
211; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #0
212; CHECK-NEXT:    fcmla z3.d, p0/m, z18.d, z6.d, #0
213; CHECK-NEXT:    fcmla z2.d, p0/m, z19.d, z17.d, #0
214; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z5.d, #90
215; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #90
216; CHECK-NEXT:    fcmla z3.d, p0/m, z18.d, z6.d, #90
217; CHECK-NEXT:    fcmla z2.d, p0/m, z19.d, z17.d, #90
218; CHECK-NEXT:    b.ne .LBB2_1
219; CHECK-NEXT:  // %bb.2: // %exit.block
220; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
221; CHECK-NEXT:    uzp1 z5.d, z1.d, z0.d
222; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
223; CHECK-NEXT:    uzp2 z0.d, z1.d, z0.d
224; CHECK-NEXT:    fadd z1.d, z4.d, z5.d
225; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
226; CHECK-NEXT:    faddv d0, p0, z1.d
227; CHECK-NEXT:    faddv d1, p0, z2.d
228; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
229; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
230; CHECK-NEXT:    ret
; Multiply-accumulate loop that handles two <vscale x 4 x double> vectors per
; input per iteration, with two independent pairs of accumulators that are
; only combined in %exit.block. The trip count here is 1000. The CHECK lines
; above expect four FCMLA #0/#90 pairs in the loop body.
231entry:
  ; %1 = 4 * vscale is the per-iteration decrement of the element counter;
  ; %n.vec rounds the trip count of 1000 down to a multiple of %1.
232  %0 = tail call i64 @llvm.vscale.i64()
233  %1 = shl nuw nsw i64 %0, 2
234  %n.mod.vf = urem i64 1000, %1
235  %n.vec = sub i64 1000, %n.mod.vf
  ; %2 = 64 * vscale is the byte offset added per iteration (two wide vectors).
  ; %3 = 32 * vscale is the size of one wide vector; %scevgep61/%scevgep63
  ; point that many bytes into %b/%a, i.e. at the second vector of each pair.
236  %2 = shl nuw nsw i64 %0, 6
237  %3 = shl nuw nsw i64 %0, 5
238  %scevgep61 = getelementptr i8, ptr %b, i64 %3
239  %scevgep63 = getelementptr i8, ptr %a, i64 %3
240  br label %vector.body
241
242vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv38 counts the remaining elements down to zero; %lsr.iv34 is the
  ; running byte offset applied to all four base pointers.
243  %lsr.iv38 = phi i64 [ %lsr.iv.next39, %vector.body ], [ %n.vec, %entry ]
244  %lsr.iv34 = phi i64 [ %lsr.iv.next35, %vector.body ], [ 0, %entry ]
  ; Accumulators, all starting at zero:
  ;   %vec.phi   <- %30  imaginary part, first vector
  ;   %vec.phi12 <- %31  imaginary part, second vector
  ;   %vec.phi13 <- %26  real part, first vector
  ;   %vec.phi14 <- %27  real part, second vector
245  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %30, %vector.body ]
246  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %31, %vector.body ]
247  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %26, %vector.body ]
248  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %27, %vector.body ]
249  %scevgep57 = getelementptr i8, ptr %a, i64 %lsr.iv34
250  %scevgep64 = getelementptr i8, ptr %scevgep63, i64 %lsr.iv34
251  %scevgep58 = getelementptr i8, ptr %b, i64 %lsr.iv34
252  %scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
  ; Two wide loads from %a; %6/%8 are fields 0/1 of the first, %7/%9 of the
  ; second.
253  %wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
254  %wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
255  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
256  %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
257  %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
258  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
259  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
260  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
  ; Two wide loads from %b; %12/%14 are fields 0/1 of the first, %13/%15 of
  ; the second.
261  %wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
262  %wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
263  %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
264  %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
265  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
266  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
267  %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
268  %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
  ; Taking field 0 as the real parts and field 1 as the imaginary parts:
  ;   %26 = %vec.phi13 + %12 * %6 - %14 * %8   (real, first vector)
  ;   %27 = %vec.phi14 + %13 * %7 - %15 * %9   (real, second vector)
  ;   %30 = %vec.phi   + %12 * %8 + %14 * %6   (imaginary, first vector)
  ;   %31 = %vec.phi12 + %13 * %9 + %15 * %7   (imaginary, second vector)
269  %16 = fmul fast <vscale x 2 x double> %14, %6
270  %17 = fmul fast <vscale x 2 x double> %15, %7
271  %18 = fmul fast <vscale x 2 x double> %12, %8
272  %19 = fmul fast <vscale x 2 x double> %13, %9
273  %20 = fmul fast <vscale x 2 x double> %12, %6
274  %21 = fmul fast <vscale x 2 x double> %13, %7
275  %22 = fadd fast <vscale x 2 x double> %20, %vec.phi13
276  %23 = fadd fast <vscale x 2 x double> %21, %vec.phi14
277  %24 = fmul fast <vscale x 2 x double> %14, %8
278  %25 = fmul fast <vscale x 2 x double> %15, %9
279  %26 = fsub fast <vscale x 2 x double> %22, %24
280  %27 = fsub fast <vscale x 2 x double> %23, %25
281  %28 = fadd fast <vscale x 2 x double> %18, %vec.phi
282  %29 = fadd fast <vscale x 2 x double> %19, %vec.phi12
283  %30 = fadd fast <vscale x 2 x double> %28, %16
284  %31 = fadd fast <vscale x 2 x double> %29, %17
  ; Advance the byte offset, decrement the counter and exit once it hits zero.
285  %lsr.iv.next35 = add i64 %lsr.iv34, %2
286  %lsr.iv.next39 = sub i64 %lsr.iv38, %1
287  %32 = icmp eq i64 %lsr.iv.next39, 0
288  br i1 %32, label %exit.block, label %vector.body
289
290exit.block:                                     ; preds = %vector.body
  ; Add the two real accumulators and the two imaginary accumulators, reduce
  ; each sum horizontally, and return them as fields {0,0} and {0,1}. This
  ; function has no scalar remainder loop.
291  %bin.rdx15 = fadd fast <vscale x 2 x double> %27, %26
292  %33 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx15)
293  %bin.rdx = fadd fast <vscale x 2 x double> %31, %30
294  %34 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx)
295  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %33, 0, 0
296  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %34, 0, 1
297  ret %"class.std::complex" %.fca.0.1.insert
298}
299
300; Integer reduction and floating-point complex reduction in the same loop:
301;   complex<double> *a = ...;
302;   int *s = ...;
303;
304;   for (int i = 0; i < N; ++i) {
305;     sum += a[i];
306;     int_sum += s[i];
307;   }
308;
309define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 {
310; CHECK-LABEL: reduction_mix:
311; CHECK:       // %bb.0: // %entry
312; CHECK-NEXT:    mov z2.d, #0 // =0x0
313; CHECK-NEXT:    cntd x9
314; CHECK-NEXT:    mov w11, #100 // =0x64
315; CHECK-NEXT:    neg x10, x9
316; CHECK-NEXT:    ptrue p0.d
317; CHECK-NEXT:    mov x8, xzr
318; CHECK-NEXT:    and x10, x10, x11
319; CHECK-NEXT:    rdvl x11, #2
320; CHECK-NEXT:    zip2 z0.d, z2.d, z2.d
321; CHECK-NEXT:    zip1 z1.d, z2.d, z2.d
322; CHECK-NEXT:  .LBB3_1: // %vector.body
323; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
324; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0]
325; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #1, mul vl]
326; CHECK-NEXT:    add x0, x0, x11
327; CHECK-NEXT:    ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
328; CHECK-NEXT:    add x8, x8, x9
329; CHECK-NEXT:    cmp x10, x8
330; CHECK-NEXT:    fadd z0.d, z4.d, z0.d
331; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
332; CHECK-NEXT:    add z2.d, z5.d, z2.d
333; CHECK-NEXT:    b.ne .LBB3_1
334; CHECK-NEXT:  // %bb.2: // %middle.block
335; CHECK-NEXT:    uaddv d2, p0, z2.d
336; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
337; CHECK-NEXT:    uzp1 z1.d, z1.d, z0.d
338; CHECK-NEXT:    faddv d0, p0, z3.d
339; CHECK-NEXT:    fmov x8, d2
340; CHECK-NEXT:    faddv d1, p0, z1.d
341; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
342; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
343; CHECK-NEXT:    str w8, [x4]
344; CHECK-NEXT:    ret
; Two independent reductions in one loop: an i32 add-reduction over %s and a
; pair of fadd reductions over the deinterleaved halves of the data at %a.
; %b, %c and %d.coerce are not referenced by the body. The CHECK lines above
; expect the floating-point accumulators to be interleaved (zip1/zip2) before
; the loop and updated with plain FADD on the loaded vectors, alongside a
; separate integer ADD.
; NOTE(review): attribute group #0 is referenced on the define line but no
; definition of it is visible in this file; confirm that is intentional.
345entry:
  ; %1 and %3 are both 2 * vscale: %1 is used to round the trip count of 100
  ; down to %n.vec, %3 is the per-iteration increment of %index.
346  %0 = tail call i64 @llvm.vscale.i64()
347  %1 = shl nuw nsw i64 %0, 1
348  %n.mod.vf = urem i64 100, %1
349  %n.vec = sub nuw nsw i64 100, %n.mod.vf
350  %2 = tail call i64 @llvm.vscale.i64()
351  %3 = shl nuw nsw i64 %2, 1
352  br label %vector.body
353
354vector.body:                                      ; preds = %vector.body, %entry
  ; %index counts elements upward from 0 until it equals %n.vec.
355  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; %vec.phi is the i32 accumulator; %vec.phi13 and %vec.phi14 accumulate
  ; field 0 and field 1 of the deinterleaved load. All start at zero.
356  %vec.phi = phi <vscale x 2 x i32> [ zeroinitializer, %entry ], [ %5, %vector.body ]
357  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %9, %vector.body ]
358  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  ; Integer reduction: load <vscale x 2 x i32> from %s and add it in.
359  %4 = getelementptr inbounds i32, ptr %s, i64 %index
360  %wide.load = load <vscale x 2 x i32>, ptr %4, align 4
361  %5 = add <vscale x 2 x i32> %wide.load, %vec.phi
  ; Floating-point reduction: load a wide vector from %a, split it with
  ; deinterleave2 and add each half to its own accumulator.
362  %6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
363  %wide.vec = load <vscale x 4 x double>, ptr %6, align 8
364  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
365  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
366  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
367  %9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
368  %10 = fadd fast <vscale x 2 x double> %8, %vec.phi14
369  %index.next = add nuw i64 %index, %3
370  %11 = icmp eq i64 %index.next, %n.vec
371  br i1 %11, label %middle.block, label %vector.body
372
373middle.block:                                     ; preds = %vector.body
  ; %12 reduces the field-1 accumulator (%10) and is inserted at {0,0};
  ; %13 reduces the field-0 accumulator (%9) and is inserted at {0,1}.
  ; The integer sum %14 is stored through %outs.
374  %12 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %10)
375  %13 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %9)
376  %14 = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %5)
377  store i32 %14, ptr %outs, align 4
378  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %12, 0, 0
379  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %13, 0, 1
380  ret %"class.std::complex" %.fca.0.1.insert
381}
382
383
384declare i64 @llvm.vscale.i64()
385declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
386declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
387declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
388