xref: /llvm-project/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll (revision f6947e479e14e7904aa0b2539a95f5dfdc8f9295)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
3
4target triple = "aarch64"
5; Expected to transform: a * b has two users (the store and the second
; multiplication) and both multiplications are still selected as fcmla pairs.
6;   *p = (a * b);
7;   return (a * b) * a;
8define <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b, ptr %p) {
9; CHECK-LABEL: mul_triangle:
10; CHECK:       // %bb.0: // %entry
11; CHECK-NEXT:    movi v3.2d, #0000000000000000
12; CHECK-NEXT:    movi v2.2d, #0000000000000000
13; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #0
14; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #90
15; CHECK-NEXT:    fcmla v2.4s, v3.4s, v0.4s, #0
16; CHECK-NEXT:    str q3, [x0]
17; CHECK-NEXT:    fcmla v2.4s, v3.4s, v0.4s, #90
18; CHECK-NEXT:    mov v0.16b, v2.16b
19; CHECK-NEXT:    ret
20entry:
  ; Split %a and %b into their even lanes (0, 2) and odd lanes (1, 3).
21  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
22  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
23  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
24  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; First product a * b: %2 is the fsub half (goes to the even lanes) and %6 is
  ; the fadd half (goes to the odd lanes). %3 already belongs to the second
  ; multiplication further down.
25  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
26  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
27  %2 = fsub fast <2 x float> %0, %1
28  %3 = fmul fast <2 x float> %2, %strided.vec35
29  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
30  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
31  %6 = fadd fast <2 x float> %4, %5
  ; Extra use of a * b: re-interleave both halves and store the whole vector.
32  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
33  store <4 x float> %otheruse, ptr %p
  ; Second product (a * b) * a, built from %2/%6 and the two halves of %a.
34  %7 = fmul fast <2 x float> %6, %strided.vec
35  %8 = fadd fast <2 x float> %3, %7
36  %9 = fmul fast <2 x float> %2, %strided.vec
37  %10 = fmul fast <2 x float> %6, %strided.vec35
38  %11 = fsub fast <2 x float> %9, %10
  ; Re-interleave the result: %11 fills lanes 0 and 2, %8 fills lanes 1 and 3.
39  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
40  ret <4 x float> %interleaved.vec
41}
42
43; Expected to not transform. Shows that external use prevents deinterleaving.
; Only one deinterleaved half of a * b is stored (%6, the fadd of the cross
; products, i.e. the half that ends up in the odd lanes of an interleaved
; result), so the expected output contains no fcmla.
44;   *p = (a * b).imag();
45;   return (a * b) * a;
46define <4 x float> @mul_triangle_external_use(<4 x float> %a, <4 x float> %b, ptr %p) {
47; CHECK-LABEL: mul_triangle_external_use:
48; CHECK:       // %bb.0: // %entry
49; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
50; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
51; CHECK-NEXT:    zip2 v4.2s, v0.2s, v2.2s
52; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
53; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
54; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
55; CHECK-NEXT:    fmul v2.2s, v4.2s, v5.2s
56; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
57; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
58; CHECK-NEXT:    fneg v1.2s, v3.2s
59; CHECK-NEXT:    fmul v3.2s, v2.2s, v4.2s
60; CHECK-NEXT:    str d2, [x0]
61; CHECK-NEXT:    fmla v1.2s, v0.2s, v5.2s
62; CHECK-NEXT:    fmul v5.2s, v2.2s, v0.2s
63; CHECK-NEXT:    fneg v3.2s, v3.2s
64; CHECK-NEXT:    fmla v5.2s, v4.2s, v1.2s
65; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
66; CHECK-NEXT:    zip1 v0.4s, v3.4s, v5.4s
67; CHECK-NEXT:    ret
68entry:
  ; Split %a and %b into their even lanes (0, 2) and odd lanes (1, 3).
69  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
70  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
71  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
72  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; First product a * b: %2 is the fsub half, %6 is the fadd half. %3 already
  ; belongs to the second multiplication further down.
73  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
74  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
75  %2 = fsub fast <2 x float> %0, %1
76  %3 = fmul fast <2 x float> %2, %strided.vec35
77  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
78  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
79  %6 = fadd fast <2 x float> %4, %5
  ; External use: the <2 x float> half %6 is stored directly, without being
  ; re-interleaved with %2 first.
80  store <2 x float> %6, ptr %p
  ; Second product (a * b) * a, built from %2/%6 and the two halves of %a.
81  %7 = fmul fast <2 x float> %6, %strided.vec
82  %8 = fadd fast <2 x float> %3, %7
83  %9 = fmul fast <2 x float> %2, %strided.vec
84  %10 = fmul fast <2 x float> %6, %strided.vec35
85  %11 = fsub fast <2 x float> %9, %10
86  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
87  ret <4 x float> %interleaved.vec
88}
89
90; Expected to transform partially (only d * c). Shows that external use of shufflevector does not prevent deinterleaving.
; The deinterleaving shuffles of %c feed both the untransformed (a * b) * c
; chain and the d * c multiplication, and d * c is still selected as fcmla.
91;   *p1 = (a * b).real();
92;   *p2 = (a * b) * c;
93;   return d * c;
94define <4 x float> @multiple_muls_shuffle_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
95; CHECK-LABEL: multiple_muls_shuffle_external:
96; CHECK:       // %bb.0: // %entry
97; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
98; CHECK-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
99; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
100; CHECK-NEXT:    zip2 v7.2s, v0.2s, v5.2s
101; CHECK-NEXT:    zip1 v16.2s, v1.2s, v6.2s
102; CHECK-NEXT:    zip2 v1.2s, v1.2s, v6.2s
103; CHECK-NEXT:    zip1 v0.2s, v0.2s, v5.2s
104; CHECK-NEXT:    fmul v5.2s, v16.2s, v7.2s
105; CHECK-NEXT:    fmul v6.2s, v1.2s, v7.2s
106; CHECK-NEXT:    fmla v5.2s, v0.2s, v1.2s
107; CHECK-NEXT:    fneg v1.2s, v6.2s
108; CHECK-NEXT:    zip1 v6.2s, v2.2s, v4.2s
109; CHECK-NEXT:    zip2 v4.2s, v2.2s, v4.2s
110; CHECK-NEXT:    fmla v1.2s, v0.2s, v16.2s
111; CHECK-NEXT:    fmul v17.2s, v6.2s, v5.2s
112; CHECK-NEXT:    movi v0.2d, #0000000000000000
113; CHECK-NEXT:    fmul v5.2s, v4.2s, v5.2s
114; CHECK-NEXT:    fmla v17.2s, v1.2s, v4.2s
115; CHECK-NEXT:    fcmla v0.4s, v2.4s, v3.4s, #0
116; CHECK-NEXT:    str d1, [x0]
117; CHECK-NEXT:    fneg v16.2s, v5.2s
118; CHECK-NEXT:    fcmla v0.4s, v2.4s, v3.4s, #90
119; CHECK-NEXT:    fmla v16.2s, v1.2s, v6.2s
120; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
121; CHECK-NEXT:    ret
122entry:
  ; Split %a and %b into their even lanes (0, 2) and odd lanes (1, 3).
123  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
124  %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
125  %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
126  %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the fadd half, %5 is the fsub half.
127  %0 = fmul fast <2 x float> %strided.vec91, %strided.vec
128  %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88
129  %2 = fadd fast <2 x float> %0, %1
130  %3 = fmul fast <2 x float> %strided.vec90, %strided.vec
131  %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88
132  %5 = fsub fast <2 x float> %3, %4
  ; External use of a single half of a * b.
133  store <2 x float> %5, ptr %p1
  ; Deinterleaving shuffles of %c, shared by (a * b) * c and d * c below.
134  %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
135  %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; (a * b) * c, re-interleaved and stored to %p2.
136  %6 = fmul fast <2 x float> %strided.vec94, %5
137  %7 = fmul fast <2 x float> %strided.vec93, %2
138  %8 = fadd fast <2 x float> %6, %7
139  %9 = fmul fast <2 x float> %strided.vec93, %5
140  %10 = fmul fast <2 x float> %strided.vec94, %2
141  %11 = fsub fast <2 x float> %9, %10
142  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
143  store <4 x float> %interleaved.vec, ptr %p2
  ; d * c, the only multiplication expected to become fcmla.
144  %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
145  %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
146  %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94
147  %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93
148  %14 = fadd fast <2 x float> %13, %12
149  %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93
150  %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94
151  %17 = fsub fast <2 x float> %15, %16
152  %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
153  ret <4 x float> %interleaved.vec98
154}
155
156; Same as above but data are loaded from memory instead of being passed as arguments.
157; Expected to transform partially (only d * c).
158; Shows that ld2 is not generated for `c` although it is used by both complex `d * c` and non-complex `(a * b) * c` instruction chains.
; In the expected output `a` and `b` are loaded with ld2, while `c` is loaded
; with a plain ldr and split with ext/zip1/zip2.
159define <4 x float> @multiple_muls_shuffle_external_with_loads(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_d, ptr %p1, ptr %p2) {
160; CHECK-LABEL: multiple_muls_shuffle_external_with_loads:
161; CHECK:       // %bb.0: // %entry
162; CHECK-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
163; CHECK-NEXT:    ld2 { v2.2s, v3.2s }, [x1]
164; CHECK-NEXT:    fmul v4.2s, v3.2s, v1.2s
165; CHECK-NEXT:    fmul v6.2s, v2.2s, v1.2s
166; CHECK-NEXT:    fneg v4.2s, v4.2s
167; CHECK-NEXT:    fmla v6.2s, v0.2s, v3.2s
168; CHECK-NEXT:    fmla v4.2s, v0.2s, v2.2s
169; CHECK-NEXT:    str d4, [x4]
170; CHECK-NEXT:    ldr q5, [x2]
171; CHECK-NEXT:    ext v7.16b, v5.16b, v5.16b, #8
172; CHECK-NEXT:    zip1 v0.2s, v5.2s, v7.2s
173; CHECK-NEXT:    zip2 v1.2s, v5.2s, v7.2s
174; CHECK-NEXT:    fmul v3.2s, v0.2s, v6.2s
175; CHECK-NEXT:    fmul v6.2s, v1.2s, v6.2s
176; CHECK-NEXT:    fmla v3.2s, v4.2s, v1.2s
177; CHECK-NEXT:    fneg v2.2s, v6.2s
178; CHECK-NEXT:    fmla v2.2s, v4.2s, v0.2s
179; CHECK-NEXT:    movi v0.2d, #0000000000000000
180; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x5]
181; CHECK-NEXT:    ldr q1, [x3]
182; CHECK-NEXT:    fcmla v0.4s, v5.4s, v1.4s, #0
183; CHECK-NEXT:    fcmla v0.4s, v5.4s, v1.4s, #90
184; CHECK-NEXT:    ret
185entry:
  ; Load %a and %b and split each into even lanes (0, 2) and odd lanes (1, 3).
186  %a = load <4 x float>, ptr %ptr_a
187  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
188  %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
189  %b = load <4 x float>, ptr %ptr_b
190  %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
191  %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the fadd half, %5 is the fsub half.
192  %0 = fmul fast <2 x float> %strided.vec91, %strided.vec
193  %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88
194  %2 = fadd fast <2 x float> %0, %1
195  %3 = fmul fast <2 x float> %strided.vec90, %strided.vec
196  %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88
197  %5 = fsub fast <2 x float> %3, %4
  ; External use of a single half of a * b.
198  store <2 x float> %5, ptr %p1
  ; %c is loaded once; its two shuffles are shared by (a * b) * c and d * c.
199  %c = load <4 x float>, ptr %ptr_c
200  %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
201  %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; (a * b) * c, re-interleaved and stored to %p2.
202  %6 = fmul fast <2 x float> %strided.vec94, %5
203  %7 = fmul fast <2 x float> %strided.vec93, %2
204  %8 = fadd fast <2 x float> %6, %7
205  %9 = fmul fast <2 x float> %strided.vec93, %5
206  %10 = fmul fast <2 x float> %strided.vec94, %2
207  %11 = fsub fast <2 x float> %9, %10
208  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
209  store <4 x float> %interleaved.vec, ptr %p2
  ; d * c, the only multiplication expected to become fcmla.
210  %d = load <4 x float>, ptr %ptr_d
211  %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
212  %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
213  %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94
214  %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93
215  %14 = fadd fast <2 x float> %13, %12
216  %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93
217  %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94
218  %17 = fsub fast <2 x float> %15, %16
219  %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
220  ret <4 x float> %interleaved.vec98
221}
222
223; Expected to not transform. Shows that external use prevents deinterleaving whole chain.
; One half of a * b is stored on its own, and d * c is both multiplied by
; a * b and returned; the expected output contains no fcmla at all.
224;   *p1 = (a * b).real();
225;   *p2 = (a * b) * (d * c);
226;   return d * c;
227define <4 x float> @multiple_muls_mul_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
228; CHECK-LABEL: multiple_muls_mul_external:
229; CHECK:       // %bb.0: // %entry
230; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
231; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
232; CHECK-NEXT:    ext v16.16b, v2.16b, v2.16b, #8
233; CHECK-NEXT:    ext v17.16b, v3.16b, v3.16b, #8
234; CHECK-NEXT:    zip2 v6.2s, v0.2s, v4.2s
235; CHECK-NEXT:    zip2 v7.2s, v1.2s, v5.2s
236; CHECK-NEXT:    zip1 v19.2s, v2.2s, v16.2s
237; CHECK-NEXT:    zip2 v2.2s, v2.2s, v16.2s
238; CHECK-NEXT:    zip2 v16.2s, v3.2s, v17.2s
239; CHECK-NEXT:    zip1 v0.2s, v0.2s, v4.2s
240; CHECK-NEXT:    zip1 v1.2s, v1.2s, v5.2s
241; CHECK-NEXT:    zip1 v3.2s, v3.2s, v17.2s
242; CHECK-NEXT:    fmul v18.2s, v6.2s, v7.2s
243; CHECK-NEXT:    fmul v5.2s, v19.2s, v16.2s
244; CHECK-NEXT:    fmul v16.2s, v2.2s, v16.2s
245; CHECK-NEXT:    fmul v7.2s, v0.2s, v7.2s
246; CHECK-NEXT:    fneg v4.2s, v18.2s
247; CHECK-NEXT:    fmla v5.2s, v3.2s, v2.2s
248; CHECK-NEXT:    fneg v2.2s, v16.2s
249; CHECK-NEXT:    fmla v7.2s, v1.2s, v6.2s
250; CHECK-NEXT:    fmla v4.2s, v1.2s, v0.2s
251; CHECK-NEXT:    fmla v2.2s, v3.2s, v19.2s
252; CHECK-NEXT:    fmul v0.2s, v7.2s, v5.2s
253; CHECK-NEXT:    fmul v17.2s, v4.2s, v5.2s
254; CHECK-NEXT:    str d4, [x0]
255; CHECK-NEXT:    fmla v17.2s, v2.2s, v7.2s
256; CHECK-NEXT:    fneg v16.2s, v0.2s
257; CHECK-NEXT:    zip1 v0.4s, v2.4s, v5.4s
258; CHECK-NEXT:    fmla v16.2s, v2.2s, v4.2s
259; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
260; CHECK-NEXT:    ret
261entry:
  ; Split %a and %b into their even lanes (0, 2) and odd lanes (1, 3).
262  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
263  %strided.vec126 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
264  %strided.vec128 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
265  %strided.vec129 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the fadd half, %5 is the fsub half. Unlike the other functions
  ; in this file, the arithmetic here carries `nnan ninf contract` instead of
  ; `fast`.
266  %0 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec129
267  %1 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec128
268  %2 = fadd nnan ninf contract <2 x float> %1, %0
269  %3 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec128
270  %4 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec129
271  %5 = fsub nnan ninf contract <2 x float> %3, %4
  ; External use of a single half of a * b.
272  store <2 x float> %5, ptr %p1
  ; d * c: %8 is the fadd half, %11 is the fsub half.
273  %strided.vec131 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
274  %strided.vec132 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
275  %strided.vec134 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
276  %strided.vec135 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
277  %6 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec135
278  %7 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec134
279  %8 = fadd nnan ninf contract <2 x float> %7, %6
280  %9 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec134
281  %10 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec135
282  %11 = fsub nnan ninf contract <2 x float> %9, %10
  ; (a * b) * (d * c), re-interleaved and stored to %p2.
283  %12 = fmul nnan ninf contract <2 x float> %5, %8
284  %13 = fmul nnan ninf contract <2 x float> %2, %11
285  %14 = fadd nnan ninf contract <2 x float> %13, %12
286  %15 = fmul nnan ninf contract <2 x float> %5, %11
287  %16 = fmul nnan ninf contract <2 x float> %2, %8
288  %17 = fsub nnan ninf contract <2 x float> %15, %16
289  %interleaved.vec = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
290  store <4 x float> %interleaved.vec, ptr %p2
  ; Second use of d * c: re-interleave %11/%8 and return it.
291  %interleaved.vec136 = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
292  ret <4 x float> %interleaved.vec136
293}
294
295; Expected to transform. Shows that composite common subexpression is not generated twice.
; u is stored to %p1 and v to %p2. The shared term c * d + g * h is held in
; %14 (fsub-based half) and %17 (fadd-based half), each used by both results.
296;  u[i] = a[i] * b[i] - (c[i] * d[i] + g[i] * h[i]);
297;  v[i] = e[i] * f[i] + (c[i] * d[i] + g[i] * h[i]);
298define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
299; CHECK-LABEL: mul_add_common_mul_add_mul:
300; CHECK:       // %bb.0: // %entry
301; CHECK-NEXT:    movi v16.2d, #0000000000000000
302; CHECK-NEXT:    movi v17.2d, #0000000000000000
303; CHECK-NEXT:    ldr q19, [sp, #112]
304; CHECK-NEXT:    ldp q18, q20, [sp, #80]
305; CHECK-NEXT:    ldr q21, [sp, #64]
306; CHECK-NEXT:    movi v22.2d, #0000000000000000
307; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #0
308; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #0
309; CHECK-NEXT:    fcmla v22.2d, v1.2d, v3.2d, #0
310; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #90
311; CHECK-NEXT:    movi v18.2d, #0000000000000000
312; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #90
313; CHECK-NEXT:    fcmla v22.2d, v1.2d, v3.2d, #90
314; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #0
315; CHECK-NEXT:    fcmla v18.2d, v0.2d, v2.2d, #0
316; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #0
317; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #90
318; CHECK-NEXT:    fcmla v18.2d, v0.2d, v2.2d, #90
319; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #90
320; CHECK-NEXT:    ldp q3, q0, [sp, #32]
321; CHECK-NEXT:    ldp q2, q1, [sp]
322; CHECK-NEXT:    fsub v4.2d, v22.2d, v16.2d
323; CHECK-NEXT:    fsub v5.2d, v18.2d, v17.2d
324; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #0
325; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #0
326; CHECK-NEXT:    stp q5, q4, [x0]
327; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #90
328; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #90
329; CHECK-NEXT:    stp q17, q16, [x1]
330; CHECK-NEXT:    ret
331entry:
  ; Partial products of a * b; they are combined with the shared term further
  ; down (%18 .. %21).
332  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
333  %strided.vec123 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
334  %strided.vec125 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
335  %strided.vec126 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
336  %0 = fmul fast <2 x double> %strided.vec125, %strided.vec
337  %1 = fmul fast <2 x double> %strided.vec126, %strided.vec
338  %2 = fmul fast <2 x double> %strided.vec125, %strided.vec123
339  %3 = fadd fast <2 x double> %1, %2
  ; Partial products of c * d.
340  %strided.vec128 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
341  %strided.vec129 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
342  %strided.vec131 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
343  %strided.vec132 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
344  %4 = fmul fast <2 x double> %strided.vec131, %strided.vec128
345  %5 = fmul fast <2 x double> %strided.vec132, %strided.vec129
346  %6 = fmul fast <2 x double> %strided.vec132, %strided.vec128
347  %7 = fmul fast <2 x double> %strided.vec131, %strided.vec129
348  %8 = fsub fast <2 x double> %4, %5
  ; Partial products of g * h.
349  %strided.vec134 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 0, i32 2>
350  %strided.vec135 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 1, i32 3>
351  %strided.vec137 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 0, i32 2>
352  %strided.vec138 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 1, i32 3>
353  %9 = fmul fast <2 x double> %strided.vec138, %strided.vec134
354  %10 = fmul fast <2 x double> %strided.vec137, %strided.vec135
355  %11 = fmul fast <2 x double> %strided.vec137, %strided.vec134
356  %12 = fmul fast <2 x double> %strided.vec135, %strided.vec138
357  %13 = fsub fast <2 x double> %11, %12
  ; Shared term c * d + g * h: %14 sums the two fsub halves, %17 sums the four
  ; cross products.
358  %14 = fadd fast <2 x double> %13, %8
359  %15 = fadd fast <2 x double> %6, %7
360  %16 = fadd fast <2 x double> %15, %9
361  %17 = fadd fast <2 x double> %16, %10
  ; u = a * b - (shared term), re-interleaved and stored to %p1.
362  %18 = fmul fast <2 x double> %strided.vec126, %strided.vec123
363  %19 = fadd fast <2 x double> %18, %14
364  %20 = fsub fast <2 x double> %0, %19
365  %21 = fsub fast <2 x double> %3, %17
366  %interleaved.vec = shufflevector <2 x double> %20, <2 x double> %21, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
367  store <4 x double> %interleaved.vec, ptr %p1, align 8
  ; v = e * f + (shared term), re-interleaved and stored to %p2.
368  %strided.vec140 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 0, i32 2>
369  %strided.vec141 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 1, i32 3>
370  %strided.vec143 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 0, i32 2>
371  %strided.vec144 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 1, i32 3>
372  %22 = fmul fast <2 x double> %strided.vec143, %strided.vec140
373  %23 = fmul fast <2 x double> %strided.vec144, %strided.vec140
374  %24 = fmul fast <2 x double> %strided.vec143, %strided.vec141
375  %25 = fadd fast <2 x double> %22, %14
376  %26 = fmul fast <2 x double> %strided.vec144, %strided.vec141
377  %27 = fsub fast <2 x double> %25, %26
378  %28 = fadd fast <2 x double> %24, %17
379  %29 = fadd fast <2 x double> %28, %23
380  %interleaved.vec145 = shufflevector <2 x double> %27, <2 x double> %29, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
381  store <4 x double> %interleaved.vec145, ptr %p2, align 8
382  ret void
383}
384