xref: /llvm-project/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll (revision edc1c3d24e6f8ed548340ce0369138fb40427a24)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
3; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
4; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
5; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
6
; Fast fmul reduction of a <2 x float> vector with start value 1.0.
; The assertions below are autogenerated (see the NOTE at the top of the file).
; Runs without -global-isel expect a single by-element fmul of lane 0 with
; lane 1; runs with -global-isel expect lane 1 to be moved into its own
; register first and then multiplied with a scalar fmul.
7define float @mul_HalfS(<2 x float> %bin.rdx)  {
8; CHECK-SD-LABEL: mul_HalfS:
9; CHECK-SD:       // %bb.0:
10; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
11; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
12; CHECK-SD-NEXT:    ret
13;
14; CHECK-GI-LABEL: mul_HalfS:
15; CHECK-GI:       // %bb.0:
16; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
17; CHECK-GI-NEXT:    mov s1, v0.s[1]
18; CHECK-GI-NEXT:    fmul s0, s0, s1
19; CHECK-GI-NEXT:    ret
20  %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
21  ret float %r
22}
23
; Fast fmul reduction of a <4 x half> vector with start value 1.0.
; Expected output differs for all four run configurations:
;  - without +fullfp16 the multiplies are performed on single-precision
;    values obtained with fcvt/fcvtl and the result is converted back to half;
;  - with +fullfp16 the multiplies are performed directly on h registers.
; The assertions are autogenerated; do not edit them by hand.
24define half @mul_HalfH(<4 x half> %bin.rdx)  {
25; CHECK-SD-NOFP16-LABEL: mul_HalfH:
26; CHECK-SD-NOFP16:       // %bb.0:
27; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
28; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
29; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
30; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
31; CHECK-SD-NOFP16-NEXT:    fmul s1, s2, s1
32; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
33; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
34; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
35; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
36; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
37; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
38; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
39; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
40; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
41; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
42; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
43; CHECK-SD-NOFP16-NEXT:    ret
44;
45; CHECK-SD-FP16-LABEL: mul_HalfH:
46; CHECK-SD-FP16:       // %bb.0:
47; CHECK-SD-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
48; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
49; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
50; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
51; CHECK-SD-FP16-NEXT:    ret
52;
53; CHECK-GI-NOFP16-LABEL: mul_HalfH:
54; CHECK-GI-NOFP16:       // %bb.0:
55; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
56; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
57; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
58; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
59; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
60; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
61; CHECK-GI-NOFP16-NEXT:    ret
62;
63; CHECK-GI-FP16-LABEL: mul_HalfH:
64; CHECK-GI-FP16:       // %bb.0:
65; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
66; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
67; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
68; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
69; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
70; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
71; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
72; CHECK-GI-FP16-NEXT:    ret
73  %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
74  ret half %r
75}
76
77
; Fast fmul reduction of a full <8 x half> vector with start value 1.0.
; Expected output per configuration (autogenerated assertions):
;  - no -global-isel, no +fullfp16: a lane-by-lane chain of seven scalar
;    multiplies, each done in single precision and rounded back to half;
;  - no -global-isel, +fullfp16: high half multiplied into the low half as a
;    4h vector, then three by-element h multiplies;
;  - -global-isel, no +fullfp16: both halves widened to 4s, reduced in single
;    precision, with one final conversion to half;
;  - -global-isel, +fullfp16: one 4h vector multiply, then three scalar h
;    multiplies.
78define half @mul_H(<8 x half> %bin.rdx)  {
79; CHECK-SD-NOFP16-LABEL: mul_H:
80; CHECK-SD-NOFP16:       // %bb.0:
81; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
82; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
83; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
84; CHECK-SD-NOFP16-NEXT:    fmul s1, s2, s1
85; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
86; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
87; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
88; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
89; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
90; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[3]
91; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
92; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
93; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
94; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
95; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[4]
96; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
97; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
98; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
99; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
100; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[5]
101; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
102; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
103; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
104; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
105; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
106; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
107; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
108; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
109; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
110; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
111; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
112; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
113; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
114; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
115; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
116; CHECK-SD-NOFP16-NEXT:    ret
117;
118; CHECK-SD-FP16-LABEL: mul_H:
119; CHECK-SD-FP16:       // %bb.0:
120; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
121; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
122; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
123; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
124; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
125; CHECK-SD-FP16-NEXT:    ret
126;
127; CHECK-GI-NOFP16-LABEL: mul_H:
128; CHECK-GI-NOFP16:       // %bb.0:
129; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
130; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
131; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v1.4s, v0.4s
132; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
133; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
134; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
135; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
136; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
137; CHECK-GI-NOFP16-NEXT:    ret
138;
139; CHECK-GI-FP16-LABEL: mul_H:
140; CHECK-GI-FP16:       // %bb.0:
141; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
142; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
143; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
144; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
145; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
146; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
147; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
148; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
149; CHECK-GI-FP16-NEXT:    ret
150  %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
151  ret half %r
152}
153
; Fast fmul reduction of a <4 x float> vector with start value 1.0.
; Both selectors are expected to fold the high 64 bits into the low 64 bits
; with one 2s vector multiply and finish with a single scalar multiply; they
; differ only in how the high half / high lane is extracted (ext + by-element
; fmul without -global-isel, explicit lane moves with -global-isel).
154define float @mul_S(<4 x float> %bin.rdx)  {
155; CHECK-SD-LABEL: mul_S:
156; CHECK-SD:       // %bb.0:
157; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
158; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
159; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
160; CHECK-SD-NEXT:    ret
161;
162; CHECK-GI-LABEL: mul_S:
163; CHECK-GI:       // %bb.0:
164; CHECK-GI-NEXT:    mov d1, v0.d[1]
165; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
166; CHECK-GI-NEXT:    mov s1, v0.s[1]
167; CHECK-GI-NEXT:    fmul s0, s0, s1
168; CHECK-GI-NEXT:    ret
169  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
170  ret float %r
171}
172
; Fast fmul reduction of a <2 x double> vector with start value 1.0.
; The assertions use the prefix shared by all four run lines, i.e. every
; configuration is expected to emit the same single by-element fmul.
173define double @mul_D(<2 x double> %bin.rdx)  {
174; CHECK-LABEL: mul_D:
175; CHECK:       // %bb.0:
176; CHECK-NEXT:    fmul d0, d0, v0.d[1]
177; CHECK-NEXT:    ret
178  %r = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx)
179  ret double %r
180}
181
; Fast fmul reduction of a <16 x half> vector with start value 1.0.
; In the expected output the input occupies two vector registers (v0 and v1),
; which are first multiplied together element-wise and then reduced like the
; 8-element case:
;  - without +fullfp16 the element-wise multiply is done on values widened to
;    4s; the run without -global-isel narrows back to 8h and then uses a
;    lane-by-lane scalar chain, the run with -global-isel stays in single
;    precision until one final conversion to half;
;  - with +fullfp16 the element-wise multiply is a single 8h fmul.
; The assertions are autogenerated; do not edit them by hand.
182define half @mul_2H(<16 x half> %bin.rdx)  {
183; CHECK-SD-NOFP16-LABEL: mul_2H:
184; CHECK-SD-NOFP16:       // %bb.0:
185; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
186; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
187; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
188; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
189; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
190; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
191; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
192; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
193; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
194; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
195; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
196; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
197; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
198; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
199; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
200; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
201; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
202; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
203; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
204; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
205; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
206; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
207; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
208; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
209; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
210; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
211; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
212; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
213; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
214; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
215; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
216; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
217; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
218; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
219; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
220; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
221; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
222; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
223; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
224; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
225; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
226; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
227; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
228; CHECK-SD-NOFP16-NEXT:    ret
229;
230; CHECK-SD-FP16-LABEL: mul_2H:
231; CHECK-SD-FP16:       // %bb.0:
232; CHECK-SD-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
233; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
234; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
235; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
236; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
237; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
238; CHECK-SD-FP16-NEXT:    ret
239;
240; CHECK-GI-NOFP16-LABEL: mul_2H:
241; CHECK-GI-NOFP16:       // %bb.0:
242; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
243; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
244; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
245; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
246; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
247; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v1.4s
248; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
249; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
250; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
251; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
252; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
253; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
254; CHECK-GI-NOFP16-NEXT:    ret
255;
256; CHECK-GI-FP16-LABEL: mul_2H:
257; CHECK-GI-FP16:       // %bb.0:
258; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
259; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
260; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
261; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
262; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
263; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
264; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
265; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
266; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
267; CHECK-GI-FP16-NEXT:    ret
268  %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
269  ret half %r
270}
271
; Fast fmul reduction of an <8 x float> vector with start value 1.0.
; Both selectors are expected to multiply the two input registers together
; as 4s vectors first and then reduce the product exactly as in mul_S.
272define float @mul_2S(<8 x float> %bin.rdx)  {
273; CHECK-SD-LABEL: mul_2S:
274; CHECK-SD:       // %bb.0:
275; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
276; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
277; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
278; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
279; CHECK-SD-NEXT:    ret
280;
281; CHECK-GI-LABEL: mul_2S:
282; CHECK-GI:       // %bb.0:
283; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
284; CHECK-GI-NEXT:    mov d1, v0.d[1]
285; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
286; CHECK-GI-NEXT:    mov s1, v0.s[1]
287; CHECK-GI-NEXT:    fmul s0, s0, s1
288; CHECK-GI-NEXT:    ret
289  %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
290  ret float %r
291}
292
; Fast fmul reduction of a <4 x double> vector with start value 1.0.
; All four run configurations share one expected sequence: a 2d vector
; multiply of the two input registers followed by a by-element fmul of the
; two remaining lanes.
293define double @mul_2D(<4 x double> %bin.rdx)  {
294; CHECK-LABEL: mul_2D:
295; CHECK:       // %bb.0:
296; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
297; CHECK-NEXT:    fmul d0, d0, v0.d[1]
298; CHECK-NEXT:    ret
299  %r = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx)
300  ret double %r
301}
302
303; At least one test where the start value is not 1.0.
; Fast fmul reduction of a <4 x float> vector with start value 42.0.
; Compared with mul_S, the expected output additionally materializes the
; constant 0x42280000 (the bit pattern of float 42.0) in w8, moves it to an
; FP register and applies it with one extra scalar fmul at the end.
304define float @mul_S_init_42(<4 x float> %bin.rdx)  {
305; CHECK-SD-LABEL: mul_S_init_42:
306; CHECK-SD:       // %bb.0:
307; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
308; CHECK-SD-NEXT:    mov w8, #1109917696 // =0x42280000
309; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
310; CHECK-SD-NEXT:    fmov s1, w8
311; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
312; CHECK-SD-NEXT:    fmul s0, s0, s1
313; CHECK-SD-NEXT:    ret
314;
315; CHECK-GI-LABEL: mul_S_init_42:
316; CHECK-GI:       // %bb.0:
317; CHECK-GI-NEXT:    mov d1, v0.d[1]
318; CHECK-GI-NEXT:    mov w8, #1109917696 // =0x42280000
319; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
320; CHECK-GI-NEXT:    mov s1, v0.s[1]
321; CHECK-GI-NEXT:    fmul s0, s0, s1
322; CHECK-GI-NEXT:    fmov s1, w8
323; CHECK-GI-NEXT:    fmul s0, s0, s1
324; CHECK-GI-NEXT:    ret
325  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
326  ret float %r
327}
328
329
; Product of two fast <8 x half> fmul reductions (both with start value 1.0),
; combined with a fast scalar fmul.
; Expected output (autogenerated assertions):
;  - runs without -global-isel multiply %a and %b element-wise first and then
;    perform a single reduction of the product;
;  - runs with -global-isel reduce each input separately and multiply the two
;    scalar results at the end.
330define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
331; CHECK-SD-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
332; CHECK-SD-NOFP16:       // %bb.0:
333; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
334; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
335; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
336; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
337; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
338; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
339; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
340; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
341; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
342; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
343; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
344; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
345; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
346; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
347; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
348; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
349; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
350; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
351; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
352; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
353; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
354; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
355; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
356; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
357; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
358; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
359; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
360; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
361; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
362; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
363; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
364; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
365; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
366; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
367; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
368; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
369; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
370; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
371; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
372; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
373; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
374; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
375; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
376; CHECK-SD-NOFP16-NEXT:    ret
377;
378; CHECK-SD-FP16-LABEL: fmul_reduct_reassoc_v8f16:
379; CHECK-SD-FP16:       // %bb.0:
380; CHECK-SD-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
381; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
382; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
383; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
384; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
385; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
386; CHECK-SD-FP16-NEXT:    ret
387;
388; CHECK-GI-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
389; CHECK-GI-NOFP16:       // %bb.0:
390; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
391; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
392; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
393; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
394; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
395; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v1.4s
396; CHECK-GI-NOFP16-NEXT:    mov d2, v0.d[1]
397; CHECK-GI-NOFP16-NEXT:    mov d3, v1.d[1]
398; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v2.2s
399; CHECK-GI-NOFP16-NEXT:    fmul v1.2s, v1.2s, v3.2s
400; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
401; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[1]
402; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s2
403; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, s3
404; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
405; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
406; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
407; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
408; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
409; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
410; CHECK-GI-NOFP16-NEXT:    ret
411;
412; CHECK-GI-FP16-LABEL: fmul_reduct_reassoc_v8f16:
413; CHECK-GI-FP16:       // %bb.0:
414; CHECK-GI-FP16-NEXT:    mov d2, v0.d[1]
415; CHECK-GI-FP16-NEXT:    mov d3, v1.d[1]
416; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v2.4h
417; CHECK-GI-FP16-NEXT:    fmul v1.4h, v1.4h, v3.4h
418; CHECK-GI-FP16-NEXT:    mov h2, v0.h[1]
419; CHECK-GI-FP16-NEXT:    mov h3, v0.h[2]
420; CHECK-GI-FP16-NEXT:    mov h4, v0.h[3]
421; CHECK-GI-FP16-NEXT:    mov h5, v1.h[1]
422; CHECK-GI-FP16-NEXT:    mov h6, v1.h[2]
423; CHECK-GI-FP16-NEXT:    mov h7, v1.h[3]
424; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
425; CHECK-GI-FP16-NEXT:    fmul h2, h3, h4
426; CHECK-GI-FP16-NEXT:    fmul h1, h1, h5
427; CHECK-GI-FP16-NEXT:    fmul h3, h6, h7
428; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
429; CHECK-GI-FP16-NEXT:    fmul h1, h1, h3
430; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
431; CHECK-GI-FP16-NEXT:    ret
432  %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
433  %r2 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %b)
434  %r = fmul fast half %r1, %r2
435  ret half %r
436}
437
; Product of two fast <8 x float> fmul reductions (start value 1.0 each),
; combined with a fast scalar fmul.
; Runs without -global-isel expect all four input registers to be multiplied
; together as 4s vectors followed by a single reduction; runs with
; -global-isel expect two independent reductions and a final scalar fmul.
438define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
439; CHECK-SD-LABEL: fmul_reduct_reassoc_v8f32:
440; CHECK-SD:       // %bb.0:
441; CHECK-SD-NEXT:    fmul v2.4s, v2.4s, v3.4s
442; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
443; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v2.4s
444; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
445; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
446; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
447; CHECK-SD-NEXT:    ret
448;
449; CHECK-GI-LABEL: fmul_reduct_reassoc_v8f32:
450; CHECK-GI:       // %bb.0:
451; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
452; CHECK-GI-NEXT:    fmul v1.4s, v2.4s, v3.4s
453; CHECK-GI-NEXT:    mov d2, v0.d[1]
454; CHECK-GI-NEXT:    mov d3, v1.d[1]
455; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
456; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
457; CHECK-GI-NEXT:    mov s2, v0.s[1]
458; CHECK-GI-NEXT:    mov s3, v1.s[1]
459; CHECK-GI-NEXT:    fmul s0, s0, s2
460; CHECK-GI-NEXT:    fmul s1, s1, s3
461; CHECK-GI-NEXT:    fmul s0, s0, s1
462; CHECK-GI-NEXT:    ret
463  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
464  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b)
465  %r = fmul fast float %r1, %r2
466  ret float %r
467}
468
; Product of two fast <4 x float> fmul reductions (start value 1.0 each),
; combined with a fast scalar fmul.
; Runs without -global-isel expect %a and %b to be multiplied as 4s vectors
; first, followed by one reduction; runs with -global-isel expect each input
; to be reduced on its own before the final scalar fmul.
469define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
470; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32:
471; CHECK-SD:       // %bb.0:
472; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
473; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
474; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
475; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
476; CHECK-SD-NEXT:    ret
477;
478; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32:
479; CHECK-GI:       // %bb.0:
480; CHECK-GI-NEXT:    mov d2, v0.d[1]
481; CHECK-GI-NEXT:    mov d3, v1.d[1]
482; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
483; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
484; CHECK-GI-NEXT:    mov s2, v0.s[1]
485; CHECK-GI-NEXT:    mov s3, v1.s[1]
486; CHECK-GI-NEXT:    fmul s0, s0, s2
487; CHECK-GI-NEXT:    fmul s1, s1, s3
488; CHECK-GI-NEXT:    fmul s0, s0, s1
489; CHECK-GI-NEXT:    ret
490  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
491  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
492  %r = fmul fast float %r1, %r2
493  ret float %r
494}
495
; Like fmul_reduct_reassoc_v4f32, but the first reduction uses the scalar
; argument %i as its start value instead of 1.0.
; In the expected output for both selectors the two vectors are reduced
; separately: %i (in s0) is multiplied by the reduced %a and then by the
; reduced %b; no 4s vector multiply of %a and %b is expected here.
496define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
497; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_init:
498; CHECK-SD:       // %bb.0:
499; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
500; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v3.2s
501; CHECK-SD-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
502; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
503; CHECK-SD-NEXT:    fmul v2.2s, v2.2s, v3.2s
504; CHECK-SD-NEXT:    fmul s0, s0, s1
505; CHECK-SD-NEXT:    fmul s1, s2, v2.s[1]
506; CHECK-SD-NEXT:    fmul s0, s0, s1
507; CHECK-SD-NEXT:    ret
508;
509; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_init:
510; CHECK-GI:       // %bb.0:
511; CHECK-GI-NEXT:    mov d3, v1.d[1]
512; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
513; CHECK-GI-NEXT:    mov d3, v2.d[1]
514; CHECK-GI-NEXT:    mov s4, v1.s[1]
515; CHECK-GI-NEXT:    fmul v2.2s, v2.2s, v3.2s
516; CHECK-GI-NEXT:    fmul s1, s1, s4
517; CHECK-GI-NEXT:    mov s3, v2.s[1]
518; CHECK-GI-NEXT:    fmul s0, s0, s1
519; CHECK-GI-NEXT:    fmul s1, s2, s3
520; CHECK-GI-NEXT:    fmul s0, s0, s1
521; CHECK-GI-NEXT:    ret
522  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
523  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
524  %r = fmul fast float %r1, %r2
525  ret float %r
526}
527
; Product of a fast <4 x float> reduction and a fast <8 x float> reduction
; (start value 1.0 each), i.e. the two inputs have different vector widths.
; Runs without -global-isel expect the two registers of %b to be multiplied
; together, the result multiplied with %a, and then a single reduction; runs
; with -global-isel expect separate reductions joined by a scalar fmul.
528define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
529; CHECK-SD-LABEL: fmul_reduct_reassoc_v4v8f32:
530; CHECK-SD:       // %bb.0:
531; CHECK-SD-NEXT:    fmul v1.4s, v1.4s, v2.4s
532; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
533; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
534; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
535; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
536; CHECK-SD-NEXT:    ret
537;
538; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32:
539; CHECK-GI:       // %bb.0:
540; CHECK-GI-NEXT:    fmul v1.4s, v1.4s, v2.4s
541; CHECK-GI-NEXT:    mov d2, v0.d[1]
542; CHECK-GI-NEXT:    mov d3, v1.d[1]
543; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
544; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
545; CHECK-GI-NEXT:    mov s2, v0.s[1]
546; CHECK-GI-NEXT:    mov s3, v1.s[1]
547; CHECK-GI-NEXT:    fmul s0, s0, s2
548; CHECK-GI-NEXT:    fmul s1, s1, s3
549; CHECK-GI-NEXT:    fmul s0, s0, s1
550; CHECK-GI-NEXT:    ret
551  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
552  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b)
553  %r = fmul fast float %r1, %r2
554  ret float %r
555}
556
; Product of two fast <4 x double> fmul reductions (start value 1.0 each),
; combined with a fast scalar fmul.
; Runs without -global-isel expect three 2d vector multiplies followed by a
; single by-element fmul; runs with -global-isel expect each input to be
; reduced separately and the two scalars multiplied at the end.
557define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
558; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f64:
559; CHECK-SD:       // %bb.0:
560; CHECK-SD-NEXT:    fmul v2.2d, v2.2d, v3.2d
561; CHECK-SD-NEXT:    fmul v0.2d, v0.2d, v1.2d
562; CHECK-SD-NEXT:    fmul v0.2d, v0.2d, v2.2d
563; CHECK-SD-NEXT:    fmul d0, d0, v0.d[1]
564; CHECK-SD-NEXT:    ret
565;
566; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f64:
567; CHECK-GI:       // %bb.0:
568; CHECK-GI-NEXT:    fmul v0.2d, v0.2d, v1.2d
569; CHECK-GI-NEXT:    fmul v1.2d, v2.2d, v3.2d
570; CHECK-GI-NEXT:    fmul d0, d0, v0.d[1]
571; CHECK-GI-NEXT:    fmul d1, d1, v1.d[1]
572; CHECK-GI-NEXT:    fmul d0, d0, d1
573; CHECK-GI-NEXT:    ret
574  %r1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a)
575  %r2 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %b)
576  %r = fmul fast double %r1, %r2
577  ret double %r
578}
579
; Variant of fmul_reduct_reassoc_v4f32 in which the first reduction result
; %r1 has a second use: the returned value is %r multiplied by %r1 again.
; Note that this last fmul carries no fast-math flags, unlike the others.
; In the expected output for both selectors the two reductions stay separate
; (each input is reduced on its own) and the reduced %a in s0 is reused for
; the final multiply.
580define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
581; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_extrause:
582; CHECK-SD:       // %bb.0:
583; CHECK-SD-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
584; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
585; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v2.2s
586; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v3.2s
587; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
588; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
589; CHECK-SD-NEXT:    fmul s1, s0, s1
590; CHECK-SD-NEXT:    fmul s0, s1, s0
591; CHECK-SD-NEXT:    ret
592;
593; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_extrause:
594; CHECK-GI:       // %bb.0:
595; CHECK-GI-NEXT:    mov d2, v0.d[1]
596; CHECK-GI-NEXT:    mov d3, v1.d[1]
597; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
598; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
599; CHECK-GI-NEXT:    mov s2, v0.s[1]
600; CHECK-GI-NEXT:    mov s3, v1.s[1]
601; CHECK-GI-NEXT:    fmul s0, s0, s2
602; CHECK-GI-NEXT:    fmul s1, s1, s3
603; CHECK-GI-NEXT:    fmul s1, s0, s1
604; CHECK-GI-NEXT:    fmul s0, s1, s0
605; CHECK-GI-NEXT:    ret
606  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
607  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
608  %r = fmul fast float %r1, %r2
609  %p = fmul float %r, %r1
610  ret float %p
611}
612
613; Function Attrs: nounwind readnone
; Declarations of the llvm.vector.reduce.fmul overloads called by the tests
; in this file. Each takes a scalar start value followed by the vector to
; reduce and returns a scalar of the vector's element type; one overload is
; declared per element type (half, float, double) and vector length used.
614declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
615declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)
616declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
617declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
618declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
619declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
620declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
621declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
622