xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
3
4target triple = "thumbv8.1m.main-none-none-eabi"
5
6; Expected to transform
; Computes (a * b) * c, treating each <4 x float> as two interleaved complex
; values (even lanes used as real parts, odd lanes as imaginary parts).
; The expected assembly is two vcmul/vcmla pairs with no lane moves.
7define arm_aapcs_vfpcc <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
8; CHECK-LABEL: mul_mul:
9; CHECK:       @ %bb.0: @ %entry
10; CHECK-NEXT:    vcmul.f32 q3, q0, q1, #0
11; CHECK-NEXT:    vcmla.f32 q3, q0, q1, #90
12; CHECK-NEXT:    vcmul.f32 q0, q3, q2, #0
13; CHECK-NEXT:    vcmla.f32 q0, q3, q2, #90
14; CHECK-NEXT:    bx lr
15entry:
  ; Deinterleave %a and %b: mask <0, 2> selects the even lanes, <1, 3> the odd lanes.
16  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
17  %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
18  %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
19  %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; First product a * b: %4 = b.re*a.im + b.im*a.re (imaginary part),
  ; %5 = b.re*a.re - b.im*a.im (real part).
20  %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151
21  %1 = fmul fast <2 x float> %strided.vec153, %strided.vec
22  %2 = fmul fast <2 x float> %strided.vec154, %strided.vec
23  %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151
24  %4 = fadd fast <2 x float> %3, %2
25  %5 = fsub fast <2 x float> %1, %0
  ; Second product (%5, %4) * c: %8 is the imaginary part, %11 the real part.
26  %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
27  %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
28  %6 = fmul fast <2 x float> %4, %strided.vec156
29  %7 = fmul fast <2 x float> %5, %strided.vec157
30  %8 = fadd fast <2 x float> %6, %7
31  %9 = fmul fast <2 x float> %strided.vec156, %5
32  %10 = fmul fast <2 x float> %4, %strided.vec157
33  %11 = fsub fast <2 x float> %9, %10
  ; Re-interleave: mask <0, 2, 1, 3> yields [%11[0], %8[0], %11[1], %8[1]].
34  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
35  ret <4 x float> %interleaved.vec
36}
37
38; Expected to not transform
; Complex-multiply-shaped expression whose second operand is assembled from two
; different full-width subtractions: its even-lane (real) half comes from
; (b - c) and its odd-lane (imaginary) half from (b - a). The expected assembly
; contains no vcmul/vcmla; it uses vsub/vmul/vfma/vneg plus scalar lane moves.
39define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
40; CHECK-LABEL: add_mul:
41; CHECK:       @ %bb.0: @ %entry
42; CHECK-NEXT:    .vsave {d8, d9}
43; CHECK-NEXT:    vpush {d8, d9}
44; CHECK-NEXT:    vsub.f32 q3, q1, q2
45; CHECK-NEXT:    vsub.f32 q0, q1, q0
46; CHECK-NEXT:    vmov.f32 s16, s9
47; CHECK-NEXT:    vmov.f32 s13, s14
48; CHECK-NEXT:    vmov.f32 s17, s11
49; CHECK-NEXT:    vmov.f32 s0, s1
50; CHECK-NEXT:    vmul.f32 q1, q3, q4
51; CHECK-NEXT:    vmov.f32 s1, s3
52; CHECK-NEXT:    vmov.f32 s9, s10
53; CHECK-NEXT:    vfma.f32 q1, q2, q0
54; CHECK-NEXT:    vmul.f32 q0, q4, q0
55; CHECK-NEXT:    vneg.f32 q4, q0
56; CHECK-NEXT:    vmov.f32 s1, s4
57; CHECK-NEXT:    vfma.f32 q4, q2, q3
58; CHECK-NEXT:    vmov.f32 s3, s5
59; CHECK-NEXT:    vmov.f32 s0, s16
60; CHECK-NEXT:    vmov.f32 s2, s17
61; CHECK-NEXT:    vpop {d8, d9}
62; CHECK-NEXT:    bx lr
63entry:
  ; %1 = even lanes of (b - c).
64  %0 = fsub fast <4 x float> %b, %c
65  %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  ; Even and odd lanes of %c.
66  %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
67  %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
68  %2 = fmul fast <2 x float> %1, %strided.vec59
  ; %4 = odd lanes of (b - a) -- a different subtraction from the one feeding %1.
69  %3 = fsub fast <4 x float> %b, %a
70  %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %6 = c.re*%4 + %1*c.im (imaginary-style sum),
  ; %9 = c.re*%1 - c.im*%4 (real-style difference).
71  %5 = fmul fast <2 x float> %strided.vec58, %4
72  %6 = fadd fast <2 x float> %5, %2
73  %7 = fmul fast <2 x float> %strided.vec58, %1
74  %8 = fmul fast <2 x float> %strided.vec59, %4
75  %9 = fsub fast <2 x float> %7, %8
  ; Re-interleave as [%9[0], %6[0], %9[1], %6[1]].
76  %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
77  ret <4 x float> %interleaved.vec
78}
79
80; Expected to not transform
; Forms the four partial products of b and c, combines them as
; %2 = b.re*c.im + b.im*c.re and %6 = b.im*c.im - b.re*c.re, and then feeds
; (%2, %6) as the (real, imaginary) operands of a complex multiply by a.
; NOTE(review): (%2, %6) equals (imag(b*c), -real(b*c)), which presumably models
; b*c rotated by 270 degrees, as the function name suggests -- confirm.
; The expected assembly contains no vcmul/vcmla; it uses vmul/vfma/vneg and
; scalar lane moves.
81define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
82; CHECK-LABEL: mul_mul270_mul:
83; CHECK:       @ %bb.0: @ %entry
84; CHECK-NEXT:    .vsave {d12}
85; CHECK-NEXT:    vpush {d12}
86; CHECK-NEXT:    .vsave {d10}
87; CHECK-NEXT:    vpush {d10}
88; CHECK-NEXT:    .vsave {d8}
89; CHECK-NEXT:    vpush {d8}
90; CHECK-NEXT:    vmov.f32 s20, s4
91; CHECK-NEXT:    vmov.f32 s16, s8
92; CHECK-NEXT:    vmov.f32 s17, s10
93; CHECK-NEXT:    vmov.f32 s21, s6
94; CHECK-NEXT:    vmul.f32 q3, q5, q4
95; CHECK-NEXT:    vmov.f32 s4, s5
96; CHECK-NEXT:    vneg.f32 q3, q3
97; CHECK-NEXT:    vmov.f32 s24, s9
98; CHECK-NEXT:    vmov.f32 s25, s11
99; CHECK-NEXT:    vmov.f32 s5, s7
100; CHECK-NEXT:    vmul.f32 q2, q1, q4
101; CHECK-NEXT:    vmov.f32 s16, s0
102; CHECK-NEXT:    vfma.f32 q3, q1, q6
103; CHECK-NEXT:    vmov.f32 s17, s2
104; CHECK-NEXT:    vmov.f32 s0, s1
105; CHECK-NEXT:    vfma.f32 q2, q5, q6
106; CHECK-NEXT:    vmul.f32 q1, q3, q4
107; CHECK-NEXT:    vmov.f32 s1, s3
108; CHECK-NEXT:    vfma.f32 q1, q2, q0
109; CHECK-NEXT:    vmul.f32 q0, q3, q0
110; CHECK-NEXT:    vneg.f32 q3, q0
111; CHECK-NEXT:    vmov.f32 s1, s4
112; CHECK-NEXT:    vfma.f32 q3, q2, q4
113; CHECK-NEXT:    vmov.f32 s3, s5
114; CHECK-NEXT:    vmov.f32 s0, s12
115; CHECK-NEXT:    vmov.f32 s2, s13
116; CHECK-NEXT:    vpop {d8}
117; CHECK-NEXT:    vpop {d10}
118; CHECK-NEXT:    vpop {d12}
119; CHECK-NEXT:    bx lr
120entry:
  ; Even/odd lanes of %c (%strided.vec, %strided.vec81) and %b (%strided.vec83, %strided.vec84).
121  %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
122  %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
123  %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
124  %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = b.re*c.im + b.im*c.re
125  %0 = fmul fast <2 x float> %strided.vec84, %strided.vec
126  %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81
127  %2 = fadd fast <2 x float> %1, %0
  ; Even/odd lanes of %a.
128  %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
129  %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
130  %3 = fmul fast <2 x float> %2, %strided.vec87
  ; %6 = b.im*c.im - b.re*c.re (operands in the opposite order from the usual
  ; real-part difference).
131  %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81
132  %5 = fmul fast <2 x float> %strided.vec83, %strided.vec
133  %6 = fsub fast <2 x float> %4, %5
  ; Multiply (%2, %6) by a: %8 = %2*a.im + %6*a.re, %11 = %2*a.re - %6*a.im.
134  %7 = fmul fast <2 x float> %6, %strided.vec86
135  %8 = fadd fast <2 x float> %3, %7
136  %9 = fmul fast <2 x float> %2, %strided.vec86
137  %10 = fmul fast <2 x float> %6, %strided.vec87
138  %11 = fsub fast <2 x float> %9, %10
  ; Re-interleave as [%11[0], %8[0], %11[1], %8[1]].
139  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
140  ret <4 x float> %interleaved.vec
141}
142
143; (a * b) * a
144; Expected to transform
; The product a * b is multiplied by a again, so the deinterleaved halves of %a
; feed both multiplies. The expected assembly is two vcmul/vcmla pairs followed
; by a vmov of the result into q0.
145define arm_aapcs_vfpcc <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) {
146; CHECK-LABEL: mul_triangle:
147; CHECK:       @ %bb.0: @ %entry
148; CHECK-NEXT:    vcmul.f32 q2, q1, q0, #0
149; CHECK-NEXT:    vcmla.f32 q2, q1, q0, #90
150; CHECK-NEXT:    vcmul.f32 q1, q0, q2, #0
151; CHECK-NEXT:    vcmla.f32 q1, q0, q2, #90
152; CHECK-NEXT:    vmov q0, q1
153; CHECK-NEXT:    bx lr
154entry:
  ; Even/odd lanes of %a (%strided.vec, %strided.vec35) and %b (%strided.vec37, %strided.vec38).
155  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
156  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
157  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
158  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = b.re*a.re - b.im*a.im (real part of a * b).
159  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
160  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
161  %2 = fsub fast <2 x float> %0, %1
162  %3 = fmul fast <2 x float> %2, %strided.vec35
  ; %6 = b.im*a.re + a.im*b.re (imaginary part of a * b).
163  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
164  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
165  %6 = fadd fast <2 x float> %4, %5
  ; Multiply (%2, %6) by a: %8 = %2*a.im + %6*a.re, %11 = %2*a.re - %6*a.im.
166  %7 = fmul fast <2 x float> %6, %strided.vec
167  %8 = fadd fast <2 x float> %3, %7
168  %9 = fmul fast <2 x float> %2, %strided.vec
169  %10 = fmul fast <2 x float> %6, %strided.vec35
170  %11 = fsub fast <2 x float> %9, %10
  ; Re-interleave as [%11[0], %8[0], %11[1], %8[1]].
171  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
172  ret <4 x float> %interleaved.vec
173}
174
175
176; d * (b * a) * (c * a)
177; Expected to transform
; Three chained complex multiplies in which %a feeds two of them: a * b is
; multiplied by d, c * a is computed separately, and the two results are
; multiplied together. The expected assembly is four vcmul/vcmla pairs.
178define arm_aapcs_vfpcc <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
179; CHECK-LABEL: mul_diamond:
180; CHECK:       @ %bb.0: @ %entry
181; CHECK-NEXT:    .vsave {d8, d9}
182; CHECK-NEXT:    vpush {d8, d9}
183; CHECK-NEXT:    vcmul.f32 q4, q1, q0, #0
184; CHECK-NEXT:    vcmla.f32 q4, q1, q0, #90
185; CHECK-NEXT:    vcmul.f32 q1, q4, q3, #0
186; CHECK-NEXT:    vcmla.f32 q1, q4, q3, #90
187; CHECK-NEXT:    vcmul.f32 q3, q2, q0, #0
188; CHECK-NEXT:    vcmla.f32 q3, q2, q0, #90
189; CHECK-NEXT:    vcmul.f32 q0, q3, q1, #0
190; CHECK-NEXT:    vcmla.f32 q0, q3, q1, #90
191; CHECK-NEXT:    vpop {d8, d9}
192; CHECK-NEXT:    bx lr
193entry:
  ; Deinterleave all four inputs: even lanes -> *.real, odd lanes -> *.imag.
194  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
195  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
196  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
197  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
198  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
199  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
200  %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
201  %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the imaginary part, %5 the real part.
202  %0 = fmul fast <2 x float> %a.imag, %b.real
203  %1 = fmul fast <2 x float> %a.real, %b.imag
204  %2 = fadd fast <2 x float> %1, %0
205  %3 = fmul fast <2 x float> %a.real, %b.real
206  %4 = fmul fast <2 x float> %b.imag, %a.imag
207  %5 = fsub fast <2 x float> %3, %4
  ; d * (a * b): %10 is the real part, %11 the imaginary part.
208  %6 = fmul fast <2 x float> %d.real, %5
209  %7 = fmul fast <2 x float> %2, %d.imag
210  %8 = fmul fast <2 x float> %d.real, %2
211  %9 = fmul fast <2 x float> %5, %d.imag
212  %10 = fsub fast <2 x float> %6, %7
213  %11 = fadd fast <2 x float> %8, %9
  ; c * a: %14 is the imaginary part, %18 the real part. The partial products
  ; of the final multiply (%15, %19, %21, %22) are interleaved with it.
214  %12 = fmul fast <2 x float> %c.real, %a.imag
215  %13 = fmul fast <2 x float> %c.imag, %a.real
216  %14 = fadd fast <2 x float> %13, %12
217  %15 = fmul fast <2 x float> %14, %10
218  %16 = fmul fast <2 x float> %c.real, %a.real
219  %17 = fmul fast <2 x float> %c.imag, %a.imag
220  %18 = fsub fast <2 x float> %16, %17
221  %19 = fmul fast <2 x float> %18, %11
  ; Final product (%18, %14) * (%10, %11): %20 is the imaginary part, %23 the real part.
222  %20 = fadd fast <2 x float> %15, %19
223  %21 = fmul fast <2 x float> %18, %10
224  %22 = fmul fast <2 x float> %14, %11
225  %23 = fsub fast <2 x float> %21, %22
  ; Re-interleave as [%23[0], %20[0], %23[1], %20[1]].
226  %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
227  ret <4 x float> %interleaved.vec
228}
229
230; Expected to transform
; Computes x = a * b and y = a * c, then combines them as
; (y.re - x.im, y.im + x.re). The expected assembly is two vcmul/vcmla pairs
; followed by a single vcadd with rotation #90.
231define arm_aapcs_vfpcc <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
232; CHECK-LABEL: mul_add90_mul:
233; CHECK:       @ %bb.0: @ %entry
234; CHECK-NEXT:    .vsave {d8, d9}
235; CHECK-NEXT:    vpush {d8, d9}
236; CHECK-NEXT:    vcmul.f32 q3, q2, q0, #0
237; CHECK-NEXT:    vcmul.f32 q4, q1, q0, #0
238; CHECK-NEXT:    vcmla.f32 q4, q1, q0, #90
239; CHECK-NEXT:    vcmla.f32 q3, q2, q0, #90
240; CHECK-NEXT:    vcadd.f32 q0, q3, q4, #90
241; CHECK-NEXT:    vpop {d8, d9}
242; CHECK-NEXT:    bx lr
243entry:
  ; Deinterleave: *r = even lanes, *i = odd lanes.
244  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
245  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
246  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
247  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
248  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
249  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
250
  ; x = a * b: %xr = br*ar - bi*ai, %xi = bi*ar + br*ai.
251  %i6 = fmul fast <2 x float> %br, %ar
252  %i7 = fmul fast <2 x float> %bi, %ai
253  %xr = fsub fast <2 x float> %i6, %i7
254  %i9 = fmul fast <2 x float> %bi, %ar
255  %i10 = fmul fast <2 x float> %br, %ai
256  %xi = fadd fast <2 x float> %i9, %i10
257
  ; y = a * c: %yr = cr*ar - ci*ai, %yi = ci*ar + cr*ai.
258  %j6 = fmul fast <2 x float> %cr, %ar
259  %j7 = fmul fast <2 x float> %ci, %ai
260  %yr = fsub fast <2 x float> %j6, %j7
261  %j9 = fmul fast <2 x float> %ci, %ar
262  %j10 = fmul fast <2 x float> %cr, %ai
263  %yi = fadd fast <2 x float> %j9, %j10
264
  ; z = (yr - xi, yi + xr): x's components are swapped and its imaginary part
  ; is subtracted before being combined with y.
265  %zr = fsub fast <2 x float> %yr, %xi
266  %zi = fadd fast <2 x float> %yi, %xr
  ; Re-interleave as [%zr[0], %zi[0], %zr[1], %zi[1]].
267  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
268  ret <4 x float> %interleaved.vec
269}
270
271; Expected to not transform
; Same outline as mul_add90_mul, except that the y half reuses the partial
; products %i6 (br*ar) and %i9 (bi*ar) from the x half instead of the
; commented-out %j6 (cr*ar) and %j9 (ci*ar). As a result %yr/%yi mix terms
; from %b and %c. The expected assembly contains no vcmul/vcmla/vcadd; it uses
; vmul/vfma/vfms/vadd/vsub and scalar lane moves.
272define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
273; CHECK-LABEL: mul_triangle_addmul:
274; CHECK:       @ %bb.0: @ %entry
275; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
276; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
277; CHECK-NEXT:    vmov.f32 s16, s0
278; CHECK-NEXT:    vmov.f32 s20, s5
279; CHECK-NEXT:    vmov.f32 s17, s2
280; CHECK-NEXT:    vmov.f32 s21, s7
281; CHECK-NEXT:    vmov.f32 s5, s6
282; CHECK-NEXT:    vmul.f32 q3, q5, q4
283; CHECK-NEXT:    vmul.f32 q4, q1, q4
284; CHECK-NEXT:    vmov.f32 s0, s1
285; CHECK-NEXT:    vmov.f32 s1, s3
286; CHECK-NEXT:    vmov q6, q4
287; CHECK-NEXT:    vfms.f32 q6, q5, q0
288; CHECK-NEXT:    vmov q7, q3
289; CHECK-NEXT:    vfma.f32 q3, q1, q0
290; CHECK-NEXT:    vmov.f32 s20, s8
291; CHECK-NEXT:    vmov.f32 s21, s10
292; CHECK-NEXT:    vmov.f32 s4, s9
293; CHECK-NEXT:    vfma.f32 q7, q5, q0
294; CHECK-NEXT:    vmov.f32 s5, s11
295; CHECK-NEXT:    vadd.f32 q5, q7, q6
296; CHECK-NEXT:    vfms.f32 q4, q1, q0
297; CHECK-NEXT:    vmov.f32 s1, s20
298; CHECK-NEXT:    vsub.f32 q1, q4, q3
299; CHECK-NEXT:    vmov.f32 s3, s21
300; CHECK-NEXT:    vmov.f32 s0, s4
301; CHECK-NEXT:    vmov.f32 s2, s5
302; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
303; CHECK-NEXT:    bx lr
304entry:
  ; Deinterleave: *r = even lanes, *i = odd lanes.
305  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
306  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
307  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
308  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
309  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
310  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
311
  ; x = a * b: %xr = br*ar - bi*ai, %xi = bi*ar + br*ai.
312  %i6 = fmul fast <2 x float> %br, %ar
313  %i7 = fmul fast <2 x float> %bi, %ai
314  %xr = fsub fast <2 x float> %i6, %i7
315  %i9 = fmul fast <2 x float> %bi, %ar
316  %i10 = fmul fast <2 x float> %br, %ai
317  %xi = fadd fast <2 x float> %i9, %i10
318
  ; %yr = br*ar - ci*ai and %yi = bi*ar + cr*ai: the first term of each is
  ; shared with x (%i6, %i9); the disabled %j6/%j9 lines are kept for reference.
319  ;%j6 = fmul fast <2 x float> %cr, %ar
320  %j7 = fmul fast <2 x float> %ci, %ai
321  %yr = fsub fast <2 x float> %i6, %j7
322  ;%j9 = fmul fast <2 x float> %ci, %ar
323  %j10 = fmul fast <2 x float> %cr, %ai
324  %yi = fadd fast <2 x float> %i9, %j10
325
  ; z = (yr - xi, yi + xr), re-interleaved as [%zr[0], %zi[0], %zr[1], %zi[1]].
326  %zr = fsub fast <2 x float> %yr, %xi
327  %zi = fadd fast <2 x float> %yi, %xr
328  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
329  ret <4 x float> %interleaved.vec
330}
331
332; Expected to not transform
; Same arithmetic as mul_triangle, but the intermediate real/imaginary halves
; of a * b (%2 and %6) have an additional use: they are concatenated (not
; interleaved) and stored to %p. The expected assembly contains no
; vcmul/vcmla; it uses vmul/vfma/vneg, scalar lane moves and a vstrw.32.
333define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) {
334; CHECK-LABEL: mul_triangle_multiuses:
335; CHECK:       @ %bb.0: @ %entry
336; CHECK-NEXT:    .vsave {d8, d9}
337; CHECK-NEXT:    vpush {d8, d9}
338; CHECK-NEXT:    vmov.f32 s16, s4
339; CHECK-NEXT:    vmov.f32 s8, s1
340; CHECK-NEXT:    vmov.f32 s17, s6
341; CHECK-NEXT:    vmov.f32 s9, s3
342; CHECK-NEXT:    vmov.f32 s4, s5
343; CHECK-NEXT:    vmul.f32 q3, q2, q4
344; CHECK-NEXT:    vmov.f32 s1, s2
345; CHECK-NEXT:    vmov.f32 s5, s7
346; CHECK-NEXT:    vfma.f32 q3, q1, q0
347; CHECK-NEXT:    vmul.f32 q1, q1, q2
348; CHECK-NEXT:    vneg.f32 q1, q1
349; CHECK-NEXT:    vfma.f32 q1, q4, q0
350; CHECK-NEXT:    vmov.f32 s18, s12
351; CHECK-NEXT:    vmov.f32 s16, s4
352; CHECK-NEXT:    vmov.f32 s17, s5
353; CHECK-NEXT:    vmov.f32 s19, s13
354; CHECK-NEXT:    vstrw.32 q4, [r0]
355; CHECK-NEXT:    vmul.f32 q4, q3, q0
356; CHECK-NEXT:    vfma.f32 q4, q1, q2
357; CHECK-NEXT:    vmul.f32 q2, q3, q2
358; CHECK-NEXT:    vneg.f32 q2, q2
359; CHECK-NEXT:    vfma.f32 q2, q1, q0
360; CHECK-NEXT:    vmov.f32 s1, s16
361; CHECK-NEXT:    vmov.f32 s0, s8
362; CHECK-NEXT:    vmov.f32 s2, s9
363; CHECK-NEXT:    vmov.f32 s3, s17
364; CHECK-NEXT:    vpop {d8, d9}
365; CHECK-NEXT:    bx lr
366entry:
  ; Even/odd lanes of %a (%strided.vec, %strided.vec35) and %b (%strided.vec37, %strided.vec38).
367  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
368  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
369  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
370  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = b.re*a.re - b.im*a.im (real part of a * b).
371  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
372  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
373  %2 = fsub fast <2 x float> %0, %1
374  %3 = fmul fast <2 x float> %2, %strided.vec35
  ; %6 = b.im*a.re + a.im*b.re (imaginary part of a * b).
375  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
376  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
377  %6 = fadd fast <2 x float> %4, %5
  ; Extra use of the intermediates: mask <0, 1, 2, 3> concatenates them as
  ; [%2[0], %2[1], %6[0], %6[1]] before the store.
378  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
379  store <4 x float> %otheruse, ptr %p
  ; Multiply (%2, %6) by a: %8 = %2*a.im + %6*a.re, %11 = %2*a.re - %6*a.im.
380  %7 = fmul fast <2 x float> %6, %strided.vec
381  %8 = fadd fast <2 x float> %3, %7
382  %9 = fmul fast <2 x float> %2, %strided.vec
383  %10 = fmul fast <2 x float> %6, %strided.vec35
384  %11 = fsub fast <2 x float> %9, %10
  ; Re-interleave as [%11[0], %8[0], %11[1], %8[1]].
385  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
386  ret <4 x float> %interleaved.vec
387}
388
389; Expected to transform
; Computes a * b and then adds %c component-wise (c.real to the real part,
; c.imag to the imaginary part). The expected assembly loads %c into q2 and
; accumulates into it with two vcmla instructions (#0 and #90).
; This function is declared without arm_aapcs_vfpcc; the expected assembly
; builds q0 from r0-r3, loads the other operands from the stack and returns
; the result in r0-r3.
390define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
391; CHECK-LABEL: mul_addequal:
392; CHECK:       @ %bb.0: @ %entry
393; CHECK-NEXT:    add.w r12, sp, #16
394; CHECK-NEXT:    vmov d0, r0, r1
395; CHECK-NEXT:    mov r0, sp
396; CHECK-NEXT:    vldrw.u32 q2, [r12]
397; CHECK-NEXT:    vldrw.u32 q1, [r0]
398; CHECK-NEXT:    vmov d1, r2, r3
399; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #0
400; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
401; CHECK-NEXT:    vmov r0, r1, d4
402; CHECK-NEXT:    vmov r2, r3, d5
403; CHECK-NEXT:    bx lr
404entry:
  ; Deinterleave %a and %b (%strided.vec holds the even lanes of %a).
405  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
406  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
407  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
408  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the imaginary part, %5 the real part.
409  %0 = fmul fast <2 x float> %b.imag, %strided.vec
410  %1 = fmul fast <2 x float> %b.real, %a.imag
411  %2 = fadd fast <2 x float> %1, %0
412  %3 = fmul fast <2 x float> %b.real, %strided.vec
413  %4 = fmul fast <2 x float> %a.imag, %b.imag
414  %5 = fsub fast <2 x float> %3, %4
  ; Add %c to both halves of the product, then re-interleave.
415  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
416  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
417  %6 = fadd fast <2 x float> %5, %c.real
418  %7 = fadd fast <2 x float> %2, %c.imag
419  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
420  ret <4 x float> %interleaved.vec
421}
422
423; Expected to transform
; Computes a * b and then subtracts %c component-wise (c.real from the real
; part, c.imag from the imaginary part). The expected assembly is a
; vcmul/vcmla pair followed by a full-width vsub.
; This function is declared without arm_aapcs_vfpcc; the expected assembly
; builds q0 from r0-r3, loads the other operands from the stack and returns
; the result in r0-r3.
424define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
425; CHECK-LABEL: mul_subequal:
426; CHECK:       @ %bb.0: @ %entry
427; CHECK-NEXT:    vmov d0, r0, r1
428; CHECK-NEXT:    mov r1, sp
429; CHECK-NEXT:    vldrw.u32 q2, [r1]
430; CHECK-NEXT:    vmov d1, r2, r3
431; CHECK-NEXT:    add r0, sp, #16
432; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
433; CHECK-NEXT:    vldrw.u32 q1, [r0]
434; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
435; CHECK-NEXT:    vsub.f32 q0, q3, q1
436; CHECK-NEXT:    vmov r0, r1, d0
437; CHECK-NEXT:    vmov r2, r3, d1
438; CHECK-NEXT:    bx lr
439entry:
  ; Deinterleave %a and %b (%strided.vec holds the even lanes of %a).
440  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
441  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
442  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
443  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the imaginary part, %5 the real part.
444  %0 = fmul fast <2 x float> %b.imag, %strided.vec
445  %1 = fmul fast <2 x float> %b.real, %a.imag
446  %2 = fadd fast <2 x float> %1, %0
447  %3 = fmul fast <2 x float> %b.real, %strided.vec
448  %4 = fmul fast <2 x float> %a.imag, %b.imag
449  %5 = fsub fast <2 x float> %3, %4
  ; Subtract %c from both halves of the product, then re-interleave.
450  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
451  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
452  %6 = fsub fast <2 x float> %5, %c.real
453  %7 = fsub fast <2 x float> %2, %c.imag
454  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
455  ret <4 x float> %interleaved.vec
456}
457
458
459; Expected to transform
; Computes a * b as a complex product and then scales each half by the matching
; half of %c (real part times c.real, imaginary part times c.imag). That final
; step is a lane-wise multiply, not a second complex multiply. The expected
; assembly is a vcmul/vcmla pair followed by a plain full-width vmul.
; This function is declared without arm_aapcs_vfpcc; the expected assembly
; builds q0 from r0-r3, loads the other operands from the stack and returns
; the result in r0-r3.
460define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
461; CHECK-LABEL: mul_mulequal:
462; CHECK:       @ %bb.0: @ %entry
463; CHECK-NEXT:    vmov d0, r0, r1
464; CHECK-NEXT:    mov r1, sp
465; CHECK-NEXT:    vldrw.u32 q2, [r1]
466; CHECK-NEXT:    vmov d1, r2, r3
467; CHECK-NEXT:    add r0, sp, #16
468; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
469; CHECK-NEXT:    vldrw.u32 q1, [r0]
470; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
471; CHECK-NEXT:    vmul.f32 q0, q3, q1
472; CHECK-NEXT:    vmov r0, r1, d0
473; CHECK-NEXT:    vmov r2, r3, d1
474; CHECK-NEXT:    bx lr
475entry:
  ; Deinterleave %a and %b (%strided.vec holds the even lanes of %a).
476  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
477  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
478  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
479  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the imaginary part, %5 the real part.
480  %0 = fmul fast <2 x float> %b.imag, %strided.vec
481  %1 = fmul fast <2 x float> %b.real, %a.imag
482  %2 = fadd fast <2 x float> %1, %0
483  %3 = fmul fast <2 x float> %b.real, %strided.vec
484  %4 = fmul fast <2 x float> %a.imag, %b.imag
485  %5 = fsub fast <2 x float> %3, %4
  ; Lane-wise scale by %c, then re-interleave.
486  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
487  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
488  %6 = fmul fast <2 x float> %5, %c.real
489  %7 = fmul fast <2 x float> %2, %c.imag
490  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
491  ret <4 x float> %interleaved.vec
492}
493
494; Expected to not transform
; Computes a * b as a complex product and then divides each half by the
; matching half of %c (real part by c.real, imaginary part by c.imag).
; The expected assembly contains no vcmul/vcmla: the product is built from
; vmul/vfma/vneg and the divisions are four scalar vdiv.f32 instructions.
; This function is declared without arm_aapcs_vfpcc; the expected assembly
; builds q0 from r0-r3, loads the other operands from the stack and returns
; the result in r0-r3.
495define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
496; CHECK-LABEL: mul_divequal:
497; CHECK:       @ %bb.0: @ %entry
498; CHECK-NEXT:    .vsave {d10, d11}
499; CHECK-NEXT:    vpush {d10, d11}
500; CHECK-NEXT:    .vsave {d8}
501; CHECK-NEXT:    vpush {d8}
502; CHECK-NEXT:    vmov d0, r0, r1
503; CHECK-NEXT:    add r0, sp, #24
504; CHECK-NEXT:    vldrw.u32 q1, [r0]
505; CHECK-NEXT:    vmov d1, r2, r3
506; CHECK-NEXT:    vmov.f32 s16, s1
507; CHECK-NEXT:    add.w r12, sp, #40
508; CHECK-NEXT:    vmov.f32 s12, s5
509; CHECK-NEXT:    vmov.f32 s13, s7
510; CHECK-NEXT:    vmov.f32 s1, s2
511; CHECK-NEXT:    vmov.f32 s8, s4
512; CHECK-NEXT:    vmul.f32 q5, q3, q0
513; CHECK-NEXT:    vmov.f32 s9, s6
514; CHECK-NEXT:    vldrw.u32 q1, [r12]
515; CHECK-NEXT:    vmov.f32 s17, s3
516; CHECK-NEXT:    vfma.f32 q5, q2, q4
517; CHECK-NEXT:    vmul.f32 q3, q4, q3
518; CHECK-NEXT:    vdiv.f32 s3, s21, s7
519; CHECK-NEXT:    vneg.f32 q3, q3
520; CHECK-NEXT:    vfma.f32 q3, q2, q0
521; CHECK-NEXT:    vdiv.f32 s1, s20, s5
522; CHECK-NEXT:    vdiv.f32 s2, s13, s6
523; CHECK-NEXT:    vdiv.f32 s0, s12, s4
524; CHECK-NEXT:    vmov r0, r1, d0
525; CHECK-NEXT:    vmov r2, r3, d1
526; CHECK-NEXT:    vpop {d8}
527; CHECK-NEXT:    vpop {d10, d11}
528; CHECK-NEXT:    bx lr
529entry:
  ; Deinterleave %a and %b (%strided.vec holds the even lanes of %a).
530  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
531  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
532  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
533  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the imaginary part, %5 the real part.
534  %0 = fmul fast <2 x float> %b.imag, %strided.vec
535  %1 = fmul fast <2 x float> %b.real, %a.imag
536  %2 = fadd fast <2 x float> %1, %0
537  %3 = fmul fast <2 x float> %b.real, %strided.vec
538  %4 = fmul fast <2 x float> %a.imag, %b.imag
539  %5 = fsub fast <2 x float> %3, %4
  ; Lane-wise divide by %c, then re-interleave.
540  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
541  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
542  %6 = fdiv fast <2 x float> %5, %c.real
543  %7 = fdiv fast <2 x float> %2, %c.imag
544  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
545  ret <4 x float> %interleaved.vec
546}
547
548; Expected to transform
; Computes a * b and then negates both the real and the imaginary half.
; The expected assembly folds the negation into the complex multiply: it is a
; vcmul with rotation #180 followed by a vcmla with rotation #270, with no
; separate vneg.
; This function is declared without arm_aapcs_vfpcc; the expected assembly
; builds q0 from r0-r3, loads %b from the stack and returns the result in
; r0-r3.
549define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
550; CHECK-LABEL: mul_negequal:
551; CHECK:       @ %bb.0: @ %entry
552; CHECK-NEXT:    vmov d0, r0, r1
553; CHECK-NEXT:    mov r0, sp
554; CHECK-NEXT:    vldrw.u32 q1, [r0]
555; CHECK-NEXT:    vmov d1, r2, r3
556; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #180
557; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #270
558; CHECK-NEXT:    vmov r0, r1, d4
559; CHECK-NEXT:    vmov r2, r3, d5
560; CHECK-NEXT:    bx lr
561entry:
  ; Deinterleave %a and %b (%strided.vec holds the even lanes of %a).
562  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
563  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
564  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
565  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; a * b: %2 is the imaginary part, %5 the real part.
566  %0 = fmul fast <2 x float> %b.imag, %strided.vec
567  %1 = fmul fast <2 x float> %b.real, %a.imag
568  %2 = fadd fast <2 x float> %1, %0
569  %3 = fmul fast <2 x float> %b.real, %strided.vec
570  %4 = fmul fast <2 x float> %a.imag, %b.imag
571  %5 = fsub fast <2 x float> %3, %4
  ; Negate both halves, then re-interleave as [%6[0], %7[0], %6[1], %7[1]].
572  %6 = fneg fast <2 x float> %5
573  %7 = fneg fast <2 x float> %2
574  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
575  ret <4 x float> %interleaved.vec
576}
577