; xref: /llvm-project/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll (revision bfc0317153dca75137fba00b5c28758d6f720963)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve -o - | FileCheck %s

target triple = "aarch64"

; a * b + c
; Complex multiply-accumulate: expected to select paired SVE fcmla (#0/#90
; rotations) accumulating directly into the deinterleaved c operand.
define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
; CHECK-LABEL: mull_add:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    fcmla z4.d, p0/m, z0.d, z2.d, #0
; CHECK-NEXT:    fcmla z5.d, p0/m, z1.d, z3.d, #0
; CHECK-NEXT:    fcmla z4.d, p0/m, z0.d, z2.d, #90
; CHECK-NEXT:    fcmla z5.d, p0/m, z1.d, z3.d, #90
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
entry:
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
  %4 = fmul fast <vscale x 2 x double> %3, %0
  %5 = fmul fast <vscale x 2 x double> %2, %1
  %6 = fadd fast <vscale x 2 x double> %4, %5
  %7 = fmul fast <vscale x 2 x double> %2, %0
  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
  %10 = fadd fast <vscale x 2 x double> %8, %7
  %11 = fmul fast <vscale x 2 x double> %3, %1
  %12 = fsub fast <vscale x 2 x double> %10, %11
  %13 = fadd fast <vscale x 2 x double> %6, %9
  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
  ret <vscale x 4 x double> %interleaved.vec
}
39
; a * b + c * d
; Two complex products summed: expected to chain four fcmla pairs into a
; zero-initialized accumulator.
define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_add_mull:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z24.d, #0 // =0x0
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z25.d, z24.d
; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #0
; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #0
; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #90
; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #90
; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
; CHECK-NEXT:    mov z1.d, z24.d
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    ret
entry:
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
  %4 = fmul fast <vscale x 2 x double> %3, %0
  %5 = fmul fast <vscale x 2 x double> %2, %1
  %6 = fmul fast <vscale x 2 x double> %2, %0
  %7 = fmul fast <vscale x 2 x double> %3, %1
  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
  %12 = fmul fast <vscale x 2 x double> %11, %8
  %13 = fmul fast <vscale x 2 x double> %10, %9
  %14 = fmul fast <vscale x 2 x double> %10, %8
  %15 = fmul fast <vscale x 2 x double> %11, %9
  %16 = fadd fast <vscale x 2 x double> %15, %7
  %17 = fadd fast <vscale x 2 x double> %14, %6
  %18 = fsub fast <vscale x 2 x double> %17, %16
  %19 = fadd fast <vscale x 2 x double> %4, %5
  %20 = fadd fast <vscale x 2 x double> %19, %13
  %21 = fadd fast <vscale x 2 x double> %20, %12
  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
  ret <vscale x 4 x double> %interleaved.vec
}
88
; a * b - c * d
; The subtracted product is folded into the accumulator using the negating
; fcmla rotations (#270/#180).
define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_sub_mull:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z24.d, #0 // =0x0
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z25.d, z24.d
; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #270
; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #270
; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #180
; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #180
; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
; CHECK-NEXT:    mov z1.d, z24.d
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    ret
entry:
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
  %4 = fmul fast <vscale x 2 x double> %3, %0
  %5 = fmul fast <vscale x 2 x double> %2, %1
  %6 = fmul fast <vscale x 2 x double> %2, %0
  %7 = fmul fast <vscale x 2 x double> %3, %1
  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
  %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 0
  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 1
  %12 = fmul fast <vscale x 2 x double> %11, %9
  %13 = fmul fast <vscale x 2 x double> %10, %8
  %14 = fadd fast <vscale x 2 x double> %13, %7
  %15 = fadd fast <vscale x 2 x double> %12, %6
  %16 = fsub fast <vscale x 2 x double> %15, %14
  %17 = fmul fast <vscale x 2 x double> %10, %9
  %18 = fmul fast <vscale x 2 x double> %11, %8
  %19 = fadd fast <vscale x 2 x double> %18, %17
  %20 = fadd fast <vscale x 2 x double> %4, %5
  %21 = fsub fast <vscale x 2 x double> %20, %19
  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
  ret <vscale x 4 x double> %interleaved.vec
}
137
; a * b + conj(c) * d
; The conjugated product is matched to fcmla with swapped operands and the
; #0/#270 rotation pair.
define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_conj_mull:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z24.d, #0 // =0x0
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z25.d, z24.d
; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #0
; CHECK-NEXT:    fcmla z25.d, p0/m, z4.d, z6.d, #0
; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #270
; CHECK-NEXT:    fcmla z25.d, p0/m, z4.d, z6.d, #270
; CHECK-NEXT:    mov z1.d, z24.d
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    ret
entry:
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
  %4 = fmul fast <vscale x 2 x double> %3, %0
  %5 = fmul fast <vscale x 2 x double> %2, %1
  %6 = fmul fast <vscale x 2 x double> %2, %0
  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
  %11 = fmul fast <vscale x 2 x double> %10, %7
  %12 = fmul fast <vscale x 2 x double> %9, %7
  %13 = fmul fast <vscale x 2 x double> %10, %8
  %14 = fmul fast <vscale x 2 x double> %3, %1
  %15 = fsub fast <vscale x 2 x double> %6, %14
  %16 = fadd fast <vscale x 2 x double> %15, %12
  %17 = fadd fast <vscale x 2 x double> %16, %13
  %18 = fadd fast <vscale x 2 x double> %4, %5
  %19 = fmul fast <vscale x 2 x double> %9, %8
  %20 = fsub fast <vscale x 2 x double> %18, %19
  %21 = fadd fast <vscale x 2 x double> %20, %11
  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
  ret <vscale x 4 x double> %interleaved.vec
}
186
; a + b + 1i * c * d
; NOTE(review): per the CHECK lines below, this pattern is currently NOT
; matched to fcmla — it lowers to explicit uzp/fmul/fmla/zip sequences.
define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_add_rot_mull:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
; CHECK-NEXT:    uzp2 z25.d, z0.d, z1.d
; CHECK-NEXT:    uzp1 z2.d, z2.d, z3.d
; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
; CHECK-NEXT:    uzp2 z1.d, z4.d, z5.d
; CHECK-NEXT:    uzp1 z26.d, z6.d, z7.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
; CHECK-NEXT:    uzp2 z5.d, z6.d, z7.d
; CHECK-NEXT:    fmul z3.d, z2.d, z25.d
; CHECK-NEXT:    fmul z25.d, z24.d, z25.d
; CHECK-NEXT:    fmla z3.d, p0/m, z24.d, z0.d
; CHECK-NEXT:    movprfx z24, z25
; CHECK-NEXT:    fmla z24.d, p0/m, z26.d, z1.d
; CHECK-NEXT:    movprfx z6, z24
; CHECK-NEXT:    fmla z6.d, p0/m, z5.d, z4.d
; CHECK-NEXT:    fmla z3.d, p0/m, z26.d, z4.d
; CHECK-NEXT:    fnmsb z2.d, p0/m, z0.d, z6.d
; CHECK-NEXT:    fmsb z1.d, p0/m, z5.d, z3.d
; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
; CHECK-NEXT:    ret
entry:
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
  %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
  %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
  %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
  %4 = fmul fast <vscale x 2 x double> %3, %0
  %5 = fmul fast <vscale x 2 x double> %2, %1
  %6 = fmul fast <vscale x 2 x double> %2, %0
  %7 = fmul fast <vscale x 2 x double> %3, %1
  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
  %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
  %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 0
  %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 1
  %12 = fmul fast <vscale x 2 x double> %10, %8
  %13 = fmul fast <vscale x 2 x double> %10, %9
  %14 = fmul fast <vscale x 2 x double> %11, %8
  %15 = fadd fast <vscale x 2 x double> %13, %7
  %16 = fadd fast <vscale x 2 x double> %15, %14
  %17 = fsub fast <vscale x 2 x double> %6, %16
  %18 = fadd fast <vscale x 2 x double> %4, %5
  %19 = fadd fast <vscale x 2 x double> %18, %12
  %20 = fmul fast <vscale x 2 x double> %11, %9
  %21 = fsub fast <vscale x 2 x double> %19, %20
  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
  ret <vscale x 4 x double> %interleaved.vec
}
243
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)