xref: /llvm-project/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll (revision e16f2f5d2491fde19afb63d5cec83625d391be30)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
3
4target triple = "aarch64"
5
; Return type of the three reduction tests in this file: a struct wrapping one
; { double, double } pair. Those tests write their first reduced sum to index
; 0,0 and their second to index 0,1 via insertvalue.
6%"struct.std::complex" = type { { double, double } }
7
8; Zero initialized reduction
9;
10;   complex<double> x = 0.0 + 0.0i;
11;   for (int i = 0; i < 100; ++i)
12;       x += a[i] * b[i];
13;
; Vectorised form of the loop in the header comment. Each iteration loads four
; doubles (32 bytes) from %a and from %b at the same byte offset and folds the
; products into two <2 x double> accumulators that both start at zero. The
; offset advances by 32 until it reaches 1600, i.e. 50 iterations of two
; complex values each. The expected assembly keeps two accumulator registers
; (v0, v1) and updates each with an fcmla #0 / fcmla #90 pair.
14define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
15; CHECK-LABEL: complex_mul_v2f64:
16; CHECK:       // %bb.0: // %entry
17; CHECK-NEXT:    movi v0.2d, #0000000000000000
18; CHECK-NEXT:    movi v1.2d, #0000000000000000
19; CHECK-NEXT:    mov x8, xzr
20; CHECK-NEXT:  .LBB0_1: // %vector.body
21; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
22; CHECK-NEXT:    add x9, x0, x8
23; CHECK-NEXT:    add x10, x1, x8
24; CHECK-NEXT:    add x8, x8, #32
25; CHECK-NEXT:    ldp q3, q2, [x9]
26; CHECK-NEXT:    cmp x8, #1600
27; CHECK-NEXT:    ldp q5, q4, [x10]
28; CHECK-NEXT:    fcmla v0.2d, v5.2d, v3.2d, #0
29; CHECK-NEXT:    fcmla v1.2d, v4.2d, v2.2d, #0
30; CHECK-NEXT:    fcmla v0.2d, v5.2d, v3.2d, #90
31; CHECK-NEXT:    fcmla v1.2d, v4.2d, v2.2d, #90
32; CHECK-NEXT:    b.ne .LBB0_1
33; CHECK-NEXT:  // %bb.2: // %middle.block
34; CHECK-NEXT:    zip2 v2.2d, v0.2d, v1.2d
35; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
36; CHECK-NEXT:    faddp d0, v0.2d
37; CHECK-NEXT:    faddp d1, v2.2d
38; CHECK-NEXT:    ret
39entry:
40  br label %vector.body
41
42vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv is a byte offset applied to both %a and %b.
  ; %vec.phi carries %7 and %vec.phi27 carries %5 across iterations.
43  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
44  %vec.phi = phi <2 x double> [ zeroinitializer, %entry ], [ %7, %vector.body ]
45  %vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %5, %vector.body ]
46  %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
47  %scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
  ; Split each wide load into its even lanes (0, 2) and odd lanes (1, 3);
  ; presumably real and imaginary parts of an interleaved { re, im } layout,
  ; as the complex<double> header comment suggests.
48  %wide.vec = load <4 x double>, ptr %scevgep, align 8
49  %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
50  %strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
51  %wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
52  %strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
53  %strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; %5 = %vec.phi27 + even(b)*even(a) - odd(b)*odd(a)
  ; %7 = %vec.phi   + even(b)*odd(a)  + odd(b)*even(a)
54  %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
55  %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
56  %2 = fmul fast <2 x double> %strided.vec30, %strided.vec
57  %3 = fadd fast <2 x double> %2, %vec.phi27
58  %4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
59  %5 = fsub fast <2 x double> %3, %4
60  %6 = fadd fast <2 x double> %1, %vec.phi
61  %7 = fadd fast <2 x double> %6, %0
62  %lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
63  %8 = icmp eq i64 %lsr.iv.next, 1600
64  br i1 %8, label %middle.block, label %vector.body
65
66middle.block:                                     ; preds = %vector.body
  ; Reduce each accumulator to a scalar: %5 feeds struct index 0,0 and %7
  ; feeds struct index 0,1.
67  %9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
68  %10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
69  %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
70  %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
71  ret %"struct.std::complex" %.fca.0.1.insert
72}
73
74; Fixed value initialized reduction
75;
76;   complex<double> x = 2.0 + 1.0i;
77;   for (int i = 0; i < 100; ++i)
78;       x += a[i] * b[i];
79;
; Same loop body as the zero-initialised test, but the accumulator phis start
; from constants: %vec.phi (which carries %7) begins at <1.0, 0.0> and
; %vec.phi27 (which carries %5) begins at <2.0, 0.0>. The expected assembly
; loads one accumulator's starting value from a constant-pool entry
; (.LCPI1_0) into v1 and zeroes the other accumulator (v0) before the loop.
80define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
81; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
82; CHECK:       // %bb.0: // %entry
83; CHECK-NEXT:    movi v0.2d, #0000000000000000
84; CHECK-NEXT:    adrp x8, .LCPI1_0
85; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
86; CHECK-NEXT:    mov x8, xzr
87; CHECK-NEXT:  .LBB1_1: // %vector.body
88; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
89; CHECK-NEXT:    add x9, x0, x8
90; CHECK-NEXT:    add x10, x1, x8
91; CHECK-NEXT:    add x8, x8, #32
92; CHECK-NEXT:    ldp q3, q2, [x9]
93; CHECK-NEXT:    cmp x8, #1600
94; CHECK-NEXT:    ldp q5, q4, [x10]
95; CHECK-NEXT:    fcmla v1.2d, v5.2d, v3.2d, #0
96; CHECK-NEXT:    fcmla v0.2d, v4.2d, v2.2d, #0
97; CHECK-NEXT:    fcmla v1.2d, v5.2d, v3.2d, #90
98; CHECK-NEXT:    fcmla v0.2d, v4.2d, v2.2d, #90
99; CHECK-NEXT:    b.ne .LBB1_1
100; CHECK-NEXT:  // %bb.2: // %middle.block
101; CHECK-NEXT:    zip2 v2.2d, v1.2d, v0.2d
102; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
103; CHECK-NEXT:    faddp d0, v0.2d
104; CHECK-NEXT:    faddp d1, v2.2d
105; CHECK-NEXT:    ret
106entry:
107  br label %vector.body
108
109vector.body:                                      ; preds = %vector.body, %entry
  ; Only lane 0 of each starting vector is non-zero, so the horizontal sums in
  ; middle.block see the constants 1.0 and 2.0 exactly once each.
110  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
111  %vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %7, %vector.body ]
112  %vec.phi27 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %5, %vector.body ]
113  %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
114  %scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
  ; Split each wide load into even lanes (0, 2) and odd lanes (1, 3).
115  %wide.vec = load <4 x double>, ptr %scevgep, align 8
116  %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
117  %strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
118  %wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
119  %strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
120  %strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; %5 = %vec.phi27 + even(b)*even(a) - odd(b)*odd(a)
  ; %7 = %vec.phi   + even(b)*odd(a)  + odd(b)*even(a)
121  %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
122  %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
123  %2 = fmul fast <2 x double> %strided.vec30, %strided.vec
124  %3 = fadd fast <2 x double> %2, %vec.phi27
125  %4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
126  %5 = fsub fast <2 x double> %3, %4
127  %6 = fadd fast <2 x double> %1, %vec.phi
128  %7 = fadd fast <2 x double> %6, %0
  ; Byte offset advances 32 per iteration and the loop exits at 1600.
129  %lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
130  %8 = icmp eq i64 %lsr.iv.next, 1600
131  br i1 %8, label %middle.block, label %vector.body
132
133middle.block:                                     ; preds = %vector.body
  ; Reduce each accumulator to a scalar: %5 feeds struct index 0,0 and %7
  ; feeds struct index 0,1.
134  %9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
135  %10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
136  %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
137  %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
138  ret %"struct.std::complex" %.fca.0.1.insert
139}
140
141; Loop unrolled with factor 2
142;
; Two-way unrolled variant of the reduction. Each iteration performs two wide
; loads per input pointer (at byte offsets -32 and 0 from pointer induction
; variables that start at %a + 32 and %b + 32) and maintains four
; <2 x double> accumulators. %vec.phi starts at <1.0, 0.0>, %vec.phi28 at
; <2.0, 0.0>, and the other two at zero. The expected assembly keeps four
; accumulator registers (v0-v3), each updated by an fcmla #0 / fcmla #90 pair.
143define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
144; CHECK-LABEL: complex_mul_v2f64_unrolled:
145; CHECK:       // %bb.0: // %entry
146; CHECK-NEXT:    movi v0.2d, #0000000000000000
147; CHECK-NEXT:    movi v1.2d, #0000000000000000
148; CHECK-NEXT:    adrp x8, .LCPI2_0
149; CHECK-NEXT:    movi v3.2d, #0000000000000000
150; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
151; CHECK-NEXT:    add x8, x0, #32
152; CHECK-NEXT:    add x9, x1, #32
153; CHECK-NEXT:    mov x10, #-100 // =0xffffffffffffff9c
154; CHECK-NEXT:  .LBB2_1: // %vector.body
155; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
156; CHECK-NEXT:    ldp q5, q4, [x8, #-32]
157; CHECK-NEXT:    adds x10, x10, #4
158; CHECK-NEXT:    ldp q7, q6, [x9, #-32]
159; CHECK-NEXT:    ldp q17, q16, [x8], #64
160; CHECK-NEXT:    ldp q19, q18, [x9], #64
161; CHECK-NEXT:    fcmla v2.2d, v7.2d, v5.2d, #0
162; CHECK-NEXT:    fcmla v0.2d, v6.2d, v4.2d, #0
163; CHECK-NEXT:    fcmla v1.2d, v19.2d, v17.2d, #0
164; CHECK-NEXT:    fcmla v3.2d, v18.2d, v16.2d, #0
165; CHECK-NEXT:    fcmla v2.2d, v7.2d, v5.2d, #90
166; CHECK-NEXT:    fcmla v0.2d, v6.2d, v4.2d, #90
167; CHECK-NEXT:    fcmla v1.2d, v19.2d, v17.2d, #90
168; CHECK-NEXT:    fcmla v3.2d, v18.2d, v16.2d, #90
169; CHECK-NEXT:    b.ne .LBB2_1
170; CHECK-NEXT:  // %bb.2: // %middle.block
171; CHECK-NEXT:    zip2 v4.2d, v1.2d, v3.2d
172; CHECK-NEXT:    zip1 v1.2d, v1.2d, v3.2d
173; CHECK-NEXT:    zip2 v3.2d, v2.2d, v0.2d
174; CHECK-NEXT:    zip1 v0.2d, v2.2d, v0.2d
175; CHECK-NEXT:    fadd v0.2d, v1.2d, v0.2d
176; CHECK-NEXT:    fadd v1.2d, v4.2d, v3.2d
177; CHECK-NEXT:    faddp d0, v0.2d
178; CHECK-NEXT:    faddp d1, v1.2d
179; CHECK-NEXT:    ret
180entry:
  ; Both pointer induction variables start 32 bytes past the base so the loop
  ; can address the first half of each 64-byte step at offset -32.
181  %scevgep = getelementptr i8, ptr %a, i64 32
182  %scevgep49 = getelementptr i8, ptr %b, i64 32
183  br label %vector.body
184
185vector.body:                                      ; preds = %vector.body, %entry
  ; %lsr.iv54 is the trip counter; %lsr.iv walks %a and %lsr.iv50 walks %b.
  ; Accumulators: %vec.phi/%vec.phi27 carry %14/%15, %vec.phi28/%vec.phi29
  ; carry %10/%11.
186  %lsr.iv54 = phi i64 [ %lsr.iv.next, %vector.body ], [ 100, %entry ]
187  %lsr.iv50 = phi ptr [ %scevgep51, %vector.body ], [ %scevgep49, %entry ]
188  %lsr.iv = phi ptr [ %scevgep48, %vector.body ], [ %scevgep, %entry ]
189  %vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %14, %vector.body ]
190  %vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %15, %vector.body ]
191  %vec.phi28 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %10, %vector.body ]
192  %vec.phi29 = phi <2 x double> [ zeroinitializer, %entry ], [ %11, %vector.body ]
193  %scevgep52 = getelementptr i8, ptr %lsr.iv, i64 -32
194  %scevgep53 = getelementptr i8, ptr %lsr.iv50, i64 -32
  ; Two wide loads from the %a stream, each split into even lanes (0, 2) and
  ; odd lanes (1, 3).
195  %wide.vec = load <4 x double>, ptr %scevgep52, align 8
196  %wide.vec30 = load <4 x double>, ptr %lsr.iv, align 8
197  %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
198  %strided.vec31 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 0, i32 2>
199  %strided.vec32 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
200  %strided.vec33 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; The same pair of loads and lane splits for the %b stream.
201  %wide.vec34 = load <4 x double>, ptr %scevgep53, align 8
202  %wide.vec35 = load <4 x double>, ptr %lsr.iv50, align 8
203  %strided.vec36 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 0, i32 2>
204  %strided.vec37 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 0, i32 2>
205  %strided.vec38 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 1, i32 3>
206  %strided.vec39 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; The two unrolled halves are interleaved statement by statement:
  ;   %10, %11 = accumulator + even(b)*even(a) - odd(b)*odd(a)
  ;   %14, %15 = accumulator + even(b)*odd(a)  + odd(b)*even(a)
207  %0 = fmul fast <2 x double> %strided.vec38, %strided.vec
208  %1 = fmul fast <2 x double> %strided.vec39, %strided.vec31
209  %2 = fmul fast <2 x double> %strided.vec36, %strided.vec32
210  %3 = fmul fast <2 x double> %strided.vec37, %strided.vec33
211  %4 = fmul fast <2 x double> %strided.vec36, %strided.vec
212  %5 = fmul fast <2 x double> %strided.vec37, %strided.vec31
213  %6 = fadd fast <2 x double> %4, %vec.phi28
214  %7 = fadd fast <2 x double> %5, %vec.phi29
215  %8 = fmul fast <2 x double> %strided.vec38, %strided.vec32
216  %9 = fmul fast <2 x double> %strided.vec39, %strided.vec33
217  %10 = fsub fast <2 x double> %6, %8
218  %11 = fsub fast <2 x double> %7, %9
219  %12 = fadd fast <2 x double> %2, %vec.phi
220  %13 = fadd fast <2 x double> %3, %vec.phi27
221  %14 = fadd fast <2 x double> %12, %0
222  %15 = fadd fast <2 x double> %13, %1
  ; Advance both pointers by 64 bytes and step the counter by -4; the loop
  ; exits when the counter reaches 0 (25 iterations starting from 100).
223  %scevgep48 = getelementptr i8, ptr %lsr.iv, i64 64
224  %scevgep51 = getelementptr i8, ptr %lsr.iv50, i64 64
225  %lsr.iv.next = add nsw i64 %lsr.iv54, -4
226  %16 = icmp eq i64 %lsr.iv.next, 0
227  br i1 %16, label %middle.block, label %vector.body
228
229middle.block:                                     ; preds = %vector.body
  ; Add the accumulators pairwise (%11 + %10 and %15 + %14) before each
  ; horizontal reduction; %17 feeds struct index 0,0 and %18 feeds 0,1.
230  %bin.rdx40 = fadd fast <2 x double> %11, %10
231  %17 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx40)
232  %bin.rdx = fadd fast <2 x double> %15, %14
233  %18 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx)
234  %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %17, 0, 0
235  %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %18, 0, 1
236  ret %"struct.std::complex" %.fca.0.1.insert
237}
238
239; The reduced bug from D153355. Shows that reduction was detected where it did not exist.
; Negative test: a loop with two vector phis whose values are never returned
; or stored (the function returns void). The expected assembly is a loop that
; contains only the exit test on bit 0 of w0, followed by ret; no vector
; instructions are expected.
240define void @incorrect_reduction_pattern(i1 %exitcond.not) {
241; CHECK-LABEL: incorrect_reduction_pattern:
242; CHECK:       // %bb.0: // %entry
243; CHECK-NEXT:  .LBB3_1: // %for.body
244; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
245; CHECK-NEXT:    tbz w0, #0, .LBB3_1
246; CHECK-NEXT:  // %bb.2: // %for.end.loopexit
247; CHECK-NEXT:    ret
248entry:
249  br label %for.body
250
251for.body:                                         ; preds = %for.body, %entry
  ; Both phis are fed by shuffles of constants rather than by arithmetic on
  ; the phi values; %add only reads the phis and is not fed back into them.
252  %vec_r = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_r, %for.body ]
253  %vec_i = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_i, %for.body ]
254  %add = fadd <4 x float> %vec_r, %vec_i
  ; An all-zero shuffle mask broadcasts lane 0 (1.0) of the first operand.
255  %lane_r = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
256  %lane_i = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
  ; The exit condition is the function argument, not a loop-computed value.
257  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
258
259for.end.loopexit:                                 ; preds = %for.body
  ; These results have no users; the function returns void.
260  %mul.r = fadd <4 x float> %lane_r, %add
261  %mul.i = fadd <4 x float> %lane_i, %add
262  ret void
263}
264
; Floating-point add-reduction intrinsic called from the middle.block of the
; three reduction tests. Every call site in this file passes -0.0 as the
; scalar start value and a <2 x double> accumulator as the vector operand.
265declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
266