xref: /llvm-project/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll (revision edc1c3d24e6f8ed548340ce0369138fb40427a24)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
3; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
4; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
5; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
6
; Fast-math fadd reduction of a <2 x float> vector with start value -0.0.
; All four llc configurations share one set of assertions: a single scalar
; pairwise faddp over v0.2s.
7define float @add_HalfS(<2 x float> %bin.rdx)  {
8; CHECK-LABEL: add_HalfS:
9; CHECK:       // %bb.0:
10; CHECK-NEXT:    faddp s0, v0.2s
11; CHECK-NEXT:    ret
12  %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
13  ret float %r
14}
15
; Fast-math fadd reduction of a <4 x half> vector with start value -0.0.
; The expected code depends on the llc configuration:
;  * default selector, no +fullfp16: lanes are extracted one by one and added
;    in single precision, converting back to half after every fadd;
;  * -global-isel, no +fullfp16: the vector is widened once with fcvtl,
;    reduced with faddp in single precision, and narrowed once at the end;
;  * +fullfp16 (either selector): two half-precision faddp instructions.
16define half @add_HalfH(<4 x half> %bin.rdx)  {
17; CHECK-SD-NOFP16-LABEL: add_HalfH:
18; CHECK-SD-NOFP16:       // %bb.0:
19; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
20; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
21; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
22; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
23; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
24; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
25; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
26; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
27; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
28; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
29; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
30; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
31; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
32; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
33; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
34; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
35; CHECK-SD-NOFP16-NEXT:    ret
36;
37; CHECK-SD-FP16-LABEL: add_HalfH:
38; CHECK-SD-FP16:       // %bb.0:
39; CHECK-SD-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
40; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
41; CHECK-SD-FP16-NEXT:    ret
42;
43; CHECK-GI-NOFP16-LABEL: add_HalfH:
44; CHECK-GI-NOFP16:       // %bb.0:
45; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
46; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
47; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
48; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
49; CHECK-GI-NOFP16-NEXT:    ret
50;
51; CHECK-GI-FP16-LABEL: add_HalfH:
52; CHECK-GI-FP16:       // %bb.0:
53; CHECK-GI-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
54; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
55; CHECK-GI-FP16-NEXT:    ret
56  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
57  ret half %r
58}
59
60
; Fast-math fadd reduction of an <8 x half> vector with start value -0.0.
; The expected code depends on the llc configuration:
;  * default selector, no +fullfp16: a seven-step scalar chain that extracts
;    each lane, adds in single precision, and converts back to half after
;    every fadd;
;  * -global-isel, no +fullfp16: both halves of the vector are widened
;    (fcvtl / fcvtl2), added element-wise, reduced with faddp in single
;    precision, and narrowed once at the end;
;  * +fullfp16 (either selector): three half-precision faddp instructions.
61define half @add_H(<8 x half> %bin.rdx)  {
62; CHECK-SD-NOFP16-LABEL: add_H:
63; CHECK-SD-NOFP16:       // %bb.0:
64; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
65; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
66; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
67; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
68; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
69; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
70; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
71; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
72; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
73; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[3]
74; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
75; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
76; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
77; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
78; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[4]
79; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
80; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
81; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
82; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
83; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[5]
84; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
85; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
86; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
87; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
88; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
89; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
90; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
91; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
92; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
93; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
94; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
95; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
96; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
97; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
98; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
99; CHECK-SD-NOFP16-NEXT:    ret
100;
101; CHECK-SD-FP16-LABEL: add_H:
102; CHECK-SD-FP16:       // %bb.0:
103; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
104; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
105; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
106; CHECK-SD-FP16-NEXT:    ret
107;
108; CHECK-GI-NOFP16-LABEL: add_H:
109; CHECK-GI-NOFP16:       // %bb.0:
110; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
111; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
112; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
113; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
114; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
115; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
116; CHECK-GI-NOFP16-NEXT:    ret
117;
118; CHECK-GI-FP16-LABEL: add_H:
119; CHECK-GI-FP16:       // %bb.0:
120; CHECK-GI-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
121; CHECK-GI-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
122; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
123; CHECK-GI-FP16-NEXT:    ret
124  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
125  ret half %r
126}
127
; Fast-math fadd reduction of a <4 x float> vector with start value -0.0.
; All four llc configurations share the assertions: a vector faddp of v0.4s
; with itself followed by a scalar faddp over v0.2s.
128define float @add_S(<4 x float> %bin.rdx)  {
129; CHECK-LABEL: add_S:
130; CHECK:       // %bb.0:
131; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
132; CHECK-NEXT:    faddp s0, v0.2s
133; CHECK-NEXT:    ret
134  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
135  ret float %r
136}
137
; Fast-math fadd reduction of a <2 x double> vector with start value -0.0.
; All four llc configurations share the assertions: a single scalar pairwise
; faddp over v0.2d.
138define double @add_D(<2 x double> %bin.rdx)  {
139; CHECK-LABEL: add_D:
140; CHECK:       // %bb.0:
141; CHECK-NEXT:    faddp d0, v0.2d
142; CHECK-NEXT:    ret
143  %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
144  ret double %r
145}
146
; Fast-math fadd reduction of a <16 x half> vector with start value -0.0.
; The asserted code reads the input from two vector registers (v0 and v1).
; The expected code depends on the llc configuration:
;  * default selector, no +fullfp16: v0 and v1 are widened, added
;    element-wise in single precision and narrowed back to eight halves
;    (fcvtn / fcvtn2); the result is then reduced lane by lane with a
;    conversion to half after every scalar fadd;
;  * -global-isel, no +fullfp16: all four quarters are widened, combined with
;    three vector fadds, reduced with faddp in single precision, and narrowed
;    once at the end;
;  * +fullfp16 (either selector): one vector fadd of v0 and v1 followed by
;    three half-precision faddp instructions.
147define half @add_2H(<16 x half> %bin.rdx)  {
148; CHECK-SD-NOFP16-LABEL: add_2H:
149; CHECK-SD-NOFP16:       // %bb.0:
150; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
151; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
152; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
153; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
154; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
155; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
156; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
157; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
158; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
159; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
160; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
161; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
162; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
163; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
164; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
165; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
166; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
167; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
168; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
169; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
170; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
171; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
172; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
173; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
174; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
175; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
176; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
177; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
178; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
179; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
180; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
181; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
182; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
183; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
184; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
185; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
186; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
187; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
188; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
189; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
190; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
191; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
192; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
193; CHECK-SD-NOFP16-NEXT:    ret
194;
195; CHECK-SD-FP16-LABEL: add_2H:
196; CHECK-SD-FP16:       // %bb.0:
197; CHECK-SD-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
198; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
199; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
200; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
201; CHECK-SD-FP16-NEXT:    ret
202;
203; CHECK-GI-NOFP16-LABEL: add_2H:
204; CHECK-GI-NOFP16:       // %bb.0:
205; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
206; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
207; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
208; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
209; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v2.4s, v0.4s
210; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v3.4s, v1.4s
211; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
212; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
213; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
214; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
215; CHECK-GI-NOFP16-NEXT:    ret
216;
217; CHECK-GI-FP16-LABEL: add_2H:
218; CHECK-GI-FP16:       // %bb.0:
219; CHECK-GI-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
220; CHECK-GI-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
221; CHECK-GI-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
222; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
223; CHECK-GI-FP16-NEXT:    ret
224  %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
225  ret half %r
226}
227
; Fast-math fadd reduction of an <8 x float> vector with start value -0.0.
; All four llc configurations share the assertions: the two input registers
; are combined with one vector fadd, then reduced with a vector faddp and a
; scalar faddp.
228define float @add_2S(<8 x float> %bin.rdx)  {
229; CHECK-LABEL: add_2S:
230; CHECK:       // %bb.0:
231; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
232; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
233; CHECK-NEXT:    faddp s0, v0.2s
234; CHECK-NEXT:    ret
235  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
236  ret float %r
237}
238
; Fast-math fadd reduction of a <4 x double> vector with start value -0.0.
; All four llc configurations share the assertions: one vector fadd of the
; two input registers followed by a scalar faddp over v0.2d.
239define double @add_2D(<4 x double> %bin.rdx)  {
240; CHECK-LABEL: add_2D:
241; CHECK:       // %bb.0:
242; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
243; CHECK-NEXT:    faddp d0, v0.2d
244; CHECK-NEXT:    ret
245  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
246  ret double %r
247}
248
249; At least one test where the start value is not -0.0.
; Here the start value is 42.0: the asserted code materializes its bit
; pattern (0x42280000) through w8 into s1 and adds it to the reduced value
; with a scalar fadd after the faddp pair. All four llc configurations share
; the assertions.
250define float @add_S_init_42(<4 x float> %bin.rdx)  {
251; CHECK-LABEL: add_S_init_42:
252; CHECK:       // %bb.0:
253; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
254; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
255; CHECK-NEXT:    fmov s1, w8
256; CHECK-NEXT:    faddp s0, v0.2s
257; CHECK-NEXT:    fadd s0, s0, s1
258; CHECK-NEXT:    ret
259  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
260  ret float %r
261}
262
263; The faddp.4s in the loop should not use v0.4s as second operand,
264; because this introduces an unnecessary cross-iteration dependency.
; The loop loads a <4 x float> per iteration, reduces it, and accumulates the
; scalar result in %red (s0 in the asserted code). The assertions, shared by
; all four llc configurations, expect both faddp instructions to operate on
; v1 only, so s0 is touched solely by the final scalar fadd.
265define float @fadd_reduction_v4f32_in_loop(ptr %ptr.start) {
266; CHECK-LABEL: fadd_reduction_v4f32_in_loop:
267; CHECK:       // %bb.0: // %entry
268; CHECK-NEXT:    movi d0, #0000000000000000
269; CHECK-NEXT:    mov x8, xzr
270; CHECK-NEXT:  .LBB9_1: // %loop
271; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
272; CHECK-NEXT:    ldr q1, [x0, x8]
273; CHECK-NEXT:    add x8, x8, #16
274; CHECK-NEXT:    cmp w8, #112
275; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
276; CHECK-NEXT:    faddp s1, v1.2s
277; CHECK-NEXT:    fadd s0, s1, s0
278; CHECK-NEXT:    b.ne .LBB9_1
279; CHECK-NEXT:  // %bb.2: // %exit
280; CHECK-NEXT:    ret
281entry:
282  br label %loop
283
284loop:
  ; %iv starts at 1 and the exit test compares the pre-increment value with 7,
  ; so the body executes seven times (7 * 16 bytes = the #112 bound above).
285  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
286  %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
  ; Running sum carried across iterations, starting at 0.0.
287  %red = phi float [ 0.000000e+00, %entry ], [ %red.next, %loop ]
288  %lv = load <4 x float>, ptr %ptr, align 4
289  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %lv)
290  %red.next = fadd fast float %r, %red
291  %ec = icmp eq i32 %iv, 7
  ; Advance by four floats, i.e. exactly one loaded vector.
292  %ptr.next = getelementptr inbounds float, ptr %ptr, i64 4
293  %iv.next= add nuw nsw i32 %iv, 1
294  br i1 %ec, label %exit, label %loop
295
296exit:
297  ret float %red.next
298}
299
300; The faddp.4h in the loop should not use v0.4h as second operand,
301; because this introduces an unnecessary cross-iteration dependency.
; The loop loads a <4 x half> per iteration, reduces it, and accumulates the
; scalar result in %red. The expected code depends on the llc configuration:
;  * +fullfp16 (either selector): both faddp instructions operate on v1 only
;    and the accumulator h0 is touched solely by the final scalar fadd;
;  * default selector, no +fullfp16: the loaded lanes are reduced with a
;    scalar single-precision chain, and the accumulator is converted to
;    single precision and back on every iteration;
;  * -global-isel, no +fullfp16: the vector is widened and reduced with faddp
;    in single precision, and the accumulator is carried across iterations
;    in w9.
302define half @fadd_reduction_v4f16_in_loop(ptr %ptr.start) {
303; CHECK-SD-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
304; CHECK-SD-NOFP16:       // %bb.0: // %entry
305; CHECK-SD-NOFP16-NEXT:    movi d0, #0000000000000000
306; CHECK-SD-NOFP16-NEXT:    mov x8, xzr
307; CHECK-SD-NOFP16-NEXT:  .LBB10_1: // %loop
308; CHECK-SD-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
309; CHECK-SD-NOFP16-NEXT:    ldr d1, [x0, x8]
310; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
311; CHECK-SD-NOFP16-NEXT:    add x8, x8, #8
312; CHECK-SD-NOFP16-NEXT:    cmp w8, #56
313; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
314; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
315; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
316; CHECK-SD-NOFP16-NEXT:    fadd s2, s3, s2
317; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
318; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[3]
319; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
320; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
321; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
322; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
323; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
324; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
325; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
326; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
327; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
328; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
329; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
330; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
331; CHECK-SD-NOFP16-NEXT:    b.ne .LBB10_1
332; CHECK-SD-NOFP16-NEXT:  // %bb.2: // %exit
333; CHECK-SD-NOFP16-NEXT:    ret
334;
335; CHECK-SD-FP16-LABEL: fadd_reduction_v4f16_in_loop:
336; CHECK-SD-FP16:       // %bb.0: // %entry
337; CHECK-SD-FP16-NEXT:    movi d0, #0000000000000000
338; CHECK-SD-FP16-NEXT:    mov x8, xzr
339; CHECK-SD-FP16-NEXT:  .LBB10_1: // %loop
340; CHECK-SD-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
341; CHECK-SD-FP16-NEXT:    ldr d1, [x0, x8]
342; CHECK-SD-FP16-NEXT:    add x8, x8, #8
343; CHECK-SD-FP16-NEXT:    cmp w8, #56
344; CHECK-SD-FP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
345; CHECK-SD-FP16-NEXT:    faddp h1, v1.2h
346; CHECK-SD-FP16-NEXT:    fadd h0, h1, h0
347; CHECK-SD-FP16-NEXT:    b.ne .LBB10_1
348; CHECK-SD-FP16-NEXT:  // %bb.2: // %exit
349; CHECK-SD-FP16-NEXT:    ret
350;
351; CHECK-GI-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
352; CHECK-GI-NOFP16:       // %bb.0: // %entry
353; CHECK-GI-NOFP16-NEXT:    mov x8, xzr
354; CHECK-GI-NOFP16-NEXT:    mov w9, #0 // =0x0
355; CHECK-GI-NOFP16-NEXT:  .LBB10_1: // %loop
356; CHECK-GI-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
357; CHECK-GI-NOFP16-NEXT:    ldr d0, [x0, x8]
358; CHECK-GI-NOFP16-NEXT:    fmov s1, w9
359; CHECK-GI-NOFP16-NEXT:    add x8, x8, #8
360; CHECK-GI-NOFP16-NEXT:    cmp w8, #56
361; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
362; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
363; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
364; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
365; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
366; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
367; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
368; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
369; CHECK-GI-NOFP16-NEXT:    fmov w9, s0
370; CHECK-GI-NOFP16-NEXT:    b.ne .LBB10_1
371; CHECK-GI-NOFP16-NEXT:  // %bb.2: // %exit
372; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
373; CHECK-GI-NOFP16-NEXT:    ret
374;
375; CHECK-GI-FP16-LABEL: fadd_reduction_v4f16_in_loop:
376; CHECK-GI-FP16:       // %bb.0: // %entry
377; CHECK-GI-FP16-NEXT:    movi d0, #0000000000000000
378; CHECK-GI-FP16-NEXT:    mov x8, xzr
379; CHECK-GI-FP16-NEXT:  .LBB10_1: // %loop
380; CHECK-GI-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
381; CHECK-GI-FP16-NEXT:    ldr d1, [x0, x8]
382; CHECK-GI-FP16-NEXT:    add x8, x8, #8
383; CHECK-GI-FP16-NEXT:    cmp w8, #56
384; CHECK-GI-FP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
385; CHECK-GI-FP16-NEXT:    faddp h1, v1.2h
386; CHECK-GI-FP16-NEXT:    fadd h0, h1, h0
387; CHECK-GI-FP16-NEXT:    b.ne .LBB10_1
388; CHECK-GI-FP16-NEXT:  // %bb.2: // %exit
389; CHECK-GI-FP16-NEXT:    ret
390entry:
391  br label %loop
392
393loop:
  ; %iv starts at 1 and the exit test compares the pre-increment value with 7,
  ; so the body executes seven times (7 * 8 bytes = the #56 bound above).
394  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
395  %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
  ; Running sum carried across iterations, starting at 0.0.
396  %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
397  %lv = load <4 x half>, ptr %ptr, align 4
398  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %lv)
399  %red.next = fadd fast half %r, %red
400  %ec = icmp eq i32 %iv, 7
  ; Advance by four halves, i.e. exactly one loaded vector.
401  %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
402  %iv.next= add nuw nsw i32 %iv, 1
403  br i1 %ec, label %exit, label %loop
404
405exit:
406  ret half %red.next
407}
408
409; The faddp.8h in the loop should not use v0.8h as second operand,
410; because this introduces an unnecessary cross-iteration dependency.
; The loop loads an <8 x half> per iteration, reduces it, and accumulates the
; scalar result in %red. The expected code depends on the llc configuration:
;  * +fullfp16 (either selector): the three faddp instructions operate on
;    v1/v2 only and the accumulator h0 is touched solely by the final scalar
;    fadd;
;  * default selector, no +fullfp16: the loaded lanes are reduced with a
;    scalar single-precision chain, and the accumulator is converted to
;    single precision and back on every iteration;
;  * -global-isel, no +fullfp16: the vector is widened and reduced in single
;    precision, and the accumulator is carried across iterations in w9.
; NOTE(review): each iteration loads 8 halves (16 bytes) but %ptr.next only
; advances by 4 halves (8 bytes), so consecutive loads overlap; the asserted
; code reflects this (ldr q, add #8). This looks carried over from the
; <4 x half> variant. Confirm it is intentional before changing it, since the
; autogenerated assertions would have to be regenerated.
411define half @fadd_reduction_v8f16_in_loop(ptr %ptr.start) {
412; CHECK-SD-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
413; CHECK-SD-NOFP16:       // %bb.0: // %entry
414; CHECK-SD-NOFP16-NEXT:    movi d0, #0000000000000000
415; CHECK-SD-NOFP16-NEXT:    mov x8, xzr
416; CHECK-SD-NOFP16-NEXT:  .LBB11_1: // %loop
417; CHECK-SD-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
418; CHECK-SD-NOFP16-NEXT:    ldr q1, [x0, x8]
419; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
420; CHECK-SD-NOFP16-NEXT:    add x8, x8, #8
421; CHECK-SD-NOFP16-NEXT:    cmp w8, #56
422; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
423; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
424; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
425; CHECK-SD-NOFP16-NEXT:    fadd s2, s3, s2
426; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
427; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
428; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
429; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
430; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
431; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[3]
432; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
433; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
434; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
435; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
436; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[4]
437; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
438; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
439; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
440; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
441; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[5]
442; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
443; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
444; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
445; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
446; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[6]
447; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
448; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
449; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
450; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
451; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
452; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
453; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
454; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
455; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
456; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
457; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
458; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
459; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
460; CHECK-SD-NOFP16-NEXT:    b.ne .LBB11_1
461; CHECK-SD-NOFP16-NEXT:  // %bb.2: // %exit
462; CHECK-SD-NOFP16-NEXT:    ret
463;
464; CHECK-SD-FP16-LABEL: fadd_reduction_v8f16_in_loop:
465; CHECK-SD-FP16:       // %bb.0: // %entry
466; CHECK-SD-FP16-NEXT:    movi d0, #0000000000000000
467; CHECK-SD-FP16-NEXT:    mov x8, xzr
468; CHECK-SD-FP16-NEXT:  .LBB11_1: // %loop
469; CHECK-SD-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
470; CHECK-SD-FP16-NEXT:    ldr q1, [x0, x8]
471; CHECK-SD-FP16-NEXT:    add x8, x8, #8
472; CHECK-SD-FP16-NEXT:    cmp w8, #56
473; CHECK-SD-FP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
474; CHECK-SD-FP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
475; CHECK-SD-FP16-NEXT:    faddp h1, v1.2h
476; CHECK-SD-FP16-NEXT:    fadd h0, h1, h0
477; CHECK-SD-FP16-NEXT:    b.ne .LBB11_1
478; CHECK-SD-FP16-NEXT:  // %bb.2: // %exit
479; CHECK-SD-FP16-NEXT:    ret
480;
481; CHECK-GI-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
482; CHECK-GI-NOFP16:       // %bb.0: // %entry
483; CHECK-GI-NOFP16-NEXT:    mov x8, xzr
484; CHECK-GI-NOFP16-NEXT:    mov w9, #0 // =0x0
485; CHECK-GI-NOFP16-NEXT:  .LBB11_1: // %loop
486; CHECK-GI-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
487; CHECK-GI-NOFP16-NEXT:    ldr q0, [x0, x8]
488; CHECK-GI-NOFP16-NEXT:    add x8, x8, #8
489; CHECK-GI-NOFP16-NEXT:    cmp w8, #56
490; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
491; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
492; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
493; CHECK-GI-NOFP16-NEXT:    fmov s1, w9
494; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
495; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
496; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
497; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
498; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
499; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
500; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
501; CHECK-GI-NOFP16-NEXT:    fmov w9, s0
502; CHECK-GI-NOFP16-NEXT:    b.ne .LBB11_1
503; CHECK-GI-NOFP16-NEXT:  // %bb.2: // %exit
504; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
505; CHECK-GI-NOFP16-NEXT:    ret
506;
507; CHECK-GI-FP16-LABEL: fadd_reduction_v8f16_in_loop:
508; CHECK-GI-FP16:       // %bb.0: // %entry
509; CHECK-GI-FP16-NEXT:    movi d0, #0000000000000000
510; CHECK-GI-FP16-NEXT:    mov x8, xzr
511; CHECK-GI-FP16-NEXT:  .LBB11_1: // %loop
512; CHECK-GI-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
513; CHECK-GI-FP16-NEXT:    ldr q1, [x0, x8]
514; CHECK-GI-FP16-NEXT:    add x8, x8, #8
515; CHECK-GI-FP16-NEXT:    cmp w8, #56
516; CHECK-GI-FP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
517; CHECK-GI-FP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
518; CHECK-GI-FP16-NEXT:    faddp h1, v1.2h
519; CHECK-GI-FP16-NEXT:    fadd h0, h1, h0
520; CHECK-GI-FP16-NEXT:    b.ne .LBB11_1
521; CHECK-GI-FP16-NEXT:  // %bb.2: // %exit
522; CHECK-GI-FP16-NEXT:    ret
523entry:
524  br label %loop
525
526loop:
  ; %iv starts at 1 and the exit test compares the pre-increment value with 7,
  ; so the body executes seven times (7 * 8 bytes = the #56 bound above).
527  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
528  %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
  ; Running sum carried across iterations, starting at 0.0.
529  %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
530  %lv = load <8 x half>, ptr %ptr, align 4
531  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %lv)
532  %red.next = fadd fast half %r, %red
533  %ec = icmp eq i32 %iv, 7
  ; Advances by four halves only, i.e. half of the eight-element vector loaded
  ; above (see the review note before the function).
534  %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
535  %iv.next= add nuw nsw i32 %iv, 1
536  br i1 %ec, label %exit, label %loop
537
538exit:
539  ret half %red.next
540}
541
542
; Sum of two fast-math fadd reductions over <8 x half> inputs, joined by a
; fast scalar fadd. The expected code depends on the llc configuration:
;  * default selector: the two inputs are first added element-wise and a
;    single reduction follows (half-precision faddp with +fullfp16; without
;    it, a widen/add/narrow sequence followed by a scalar lane-by-lane chain);
;  * -global-isel: each input is reduced on its own and the two scalar
;    results are added at the end (in half precision with +fullfp16, through
;    single precision without it).
543define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
544; CHECK-SD-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
545; CHECK-SD-NOFP16:       // %bb.0:
546; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
547; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
548; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
549; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
550; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
551; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
552; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
553; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
554; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
555; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
556; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
557; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
558; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
559; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
560; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
561; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
562; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
563; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
564; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
565; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
566; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
567; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
568; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
569; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
570; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
571; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
572; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
573; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
574; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
575; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
576; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
577; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
578; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
579; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
580; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
581; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
582; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
583; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
584; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
585; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
586; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
587; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
588; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
589; CHECK-SD-NOFP16-NEXT:    ret
590;
591; CHECK-SD-FP16-LABEL: fadd_reduct_reassoc_v8f16:
592; CHECK-SD-FP16:       // %bb.0:
593; CHECK-SD-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
594; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
595; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
596; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
597; CHECK-SD-FP16-NEXT:    ret
598;
599; CHECK-GI-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
600; CHECK-GI-NOFP16:       // %bb.0:
601; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
602; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
603; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
604; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
605; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v2.4s, v0.4s
606; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v3.4s, v1.4s
607; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
608; CHECK-GI-NOFP16-NEXT:    faddp v1.4s, v1.4s, v1.4s
609; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
610; CHECK-GI-NOFP16-NEXT:    faddp s1, v1.2s
611; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
612; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
613; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
614; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
615; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
616; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
617; CHECK-GI-NOFP16-NEXT:    ret
618;
619; CHECK-GI-FP16-LABEL: fadd_reduct_reassoc_v8f16:
620; CHECK-GI-FP16:       // %bb.0:
621; CHECK-GI-FP16-NEXT:    faddp v2.8h, v0.8h, v0.8h
622; CHECK-GI-FP16-NEXT:    faddp v3.8h, v1.8h, v1.8h
623; CHECK-GI-FP16-NEXT:    faddp v0.8h, v2.8h, v0.8h
624; CHECK-GI-FP16-NEXT:    faddp v1.8h, v3.8h, v1.8h
625; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
626; CHECK-GI-FP16-NEXT:    faddp h1, v1.2h
627; CHECK-GI-FP16-NEXT:    fadd h0, h0, h1
628; CHECK-GI-FP16-NEXT:    ret
629  %r1 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %a)
630  %r2 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %b)
631  %r = fadd fast half %r1, %r2
632  ret half %r
633}
634
; Sum of two fast-math fadd reductions over <8 x float> inputs, joined by a
; fast scalar fadd. Assertions are split by instruction selector only:
;  * default selector: all four input registers are combined with vector
;    fadds and a single faddp reduction follows;
;  * -global-isel: each input is reduced on its own and the two scalar
;    results are added at the end.
635define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
636; CHECK-SD-LABEL: fadd_reduct_reassoc_v8f32:
637; CHECK-SD:       // %bb.0:
638; CHECK-SD-NEXT:    fadd v2.4s, v2.4s, v3.4s
639; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
640; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v2.4s
641; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
642; CHECK-SD-NEXT:    faddp s0, v0.2s
643; CHECK-SD-NEXT:    ret
644;
645; CHECK-GI-LABEL: fadd_reduct_reassoc_v8f32:
646; CHECK-GI:       // %bb.0:
647; CHECK-GI-NEXT:    fadd v0.4s, v0.4s, v1.4s
648; CHECK-GI-NEXT:    fadd v1.4s, v2.4s, v3.4s
649; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
650; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
651; CHECK-GI-NEXT:    faddp s0, v0.2s
652; CHECK-GI-NEXT:    faddp s1, v1.2s
653; CHECK-GI-NEXT:    fadd s0, s0, s1
654; CHECK-GI-NEXT:    ret
655  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
656  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
657  %r = fadd fast float %r1, %r2
658  ret float %r
659}
660
; Sum of two fast-math fadd reductions over <4 x float> inputs, joined by a
; fast scalar fadd. Assertions are split by instruction selector only:
;  * default selector: the inputs are added element-wise and reduced once;
;  * -global-isel: each input is reduced on its own and the two scalar
;    results are added at the end.
661define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
662; CHECK-SD-LABEL: fadd_reduct_reassoc_v4f32:
663; CHECK-SD:       // %bb.0:
664; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
665; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
666; CHECK-SD-NEXT:    faddp s0, v0.2s
667; CHECK-SD-NEXT:    ret
668;
669; CHECK-GI-LABEL: fadd_reduct_reassoc_v4f32:
670; CHECK-GI:       // %bb.0:
671; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
672; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
673; CHECK-GI-NEXT:    faddp s0, v0.2s
674; CHECK-GI-NEXT:    faddp s1, v1.2s
675; CHECK-GI-NEXT:    fadd s0, s0, s1
676; CHECK-GI-NEXT:    ret
677  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
678  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
679  %r = fadd fast float %r1, %r2
680  ret float %r
681}
682
; Like fadd_reduct_reassoc_v4f32, but the first reduction uses the scalar
; argument %i as its start value instead of -0.0. All four llc configurations
; share the assertions: each vector is reduced on its own with faddp, and the
; two results are added to the start value (s0) with scalar fadds.
683define float @fadd_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
684; CHECK-LABEL: fadd_reduct_reassoc_v4f32_init:
685; CHECK:       // %bb.0:
686; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
687; CHECK-NEXT:    faddp v2.4s, v2.4s, v2.4s
688; CHECK-NEXT:    faddp s1, v1.2s
689; CHECK-NEXT:    fadd s0, s0, s1
690; CHECK-NEXT:    faddp s1, v2.2s
691; CHECK-NEXT:    fadd s0, s0, s1
692; CHECK-NEXT:    ret
693  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %i, <4 x float> %a)
694  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
695  %r = fadd fast float %r1, %r2
696  ret float %r
697}
698
; Sum of a <4 x float> reduction and an <8 x float> reduction, joined by a
; fast scalar fadd. Assertions are split by instruction selector only:
;  * default selector: the two registers of %b are added, the result is added
;    to %a, and a single faddp reduction follows;
;  * -global-isel: the two registers of %b are added, then %a and that sum
;    are each reduced on their own and the scalar results are added.
699define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
700; CHECK-SD-LABEL: fadd_reduct_reassoc_v4v8f32:
701; CHECK-SD:       // %bb.0:
702; CHECK-SD-NEXT:    fadd v1.4s, v1.4s, v2.4s
703; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
704; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
705; CHECK-SD-NEXT:    faddp s0, v0.2s
706; CHECK-SD-NEXT:    ret
707;
708; CHECK-GI-LABEL: fadd_reduct_reassoc_v4v8f32:
709; CHECK-GI:       // %bb.0:
710; CHECK-GI-NEXT:    fadd v1.4s, v1.4s, v2.4s
711; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
712; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
713; CHECK-GI-NEXT:    faddp s0, v0.2s
714; CHECK-GI-NEXT:    faddp s1, v1.2s
715; CHECK-GI-NEXT:    fadd s0, s0, s1
716; CHECK-GI-NEXT:    ret
717  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
718  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
719  %r = fadd fast float %r1, %r2
720  ret float %r
721}
722
; Sum of two fast-math fadd reductions over <4 x double> inputs, joined by a
; fast scalar fadd. Assertions are split by instruction selector only:
;  * default selector: all four input registers are combined with vector
;    fadds and a single faddp follows;
;  * -global-isel: each input is reduced on its own and the two scalar
;    results are added at the end.
723define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
724; CHECK-SD-LABEL: fadd_reduct_reassoc_v4f64:
725; CHECK-SD:       // %bb.0:
726; CHECK-SD-NEXT:    fadd v2.2d, v2.2d, v3.2d
727; CHECK-SD-NEXT:    fadd v0.2d, v0.2d, v1.2d
728; CHECK-SD-NEXT:    fadd v0.2d, v0.2d, v2.2d
729; CHECK-SD-NEXT:    faddp d0, v0.2d
730; CHECK-SD-NEXT:    ret
731;
732; CHECK-GI-LABEL: fadd_reduct_reassoc_v4f64:
733; CHECK-GI:       // %bb.0:
734; CHECK-GI-NEXT:    fadd v0.2d, v0.2d, v1.2d
735; CHECK-GI-NEXT:    fadd v1.2d, v2.2d, v3.2d
736; CHECK-GI-NEXT:    faddp d0, v0.2d
737; CHECK-GI-NEXT:    faddp d1, v1.2d
738; CHECK-GI-NEXT:    fadd d0, d0, d1
739; CHECK-GI-NEXT:    ret
740  %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a)
741  %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b)
742  %r = fadd fast double %r1, %r2
743  ret double %r
744}
745
; Like fadd_reduct_reassoc_v4f32, but the first reduction result %r1 has a
; second use in the fmul below. All four llc configurations share the
; assertions: the two reductions stay separate, so the value of %r1 (s0) is
; still available for the multiply.
746define float @fadd_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
747; CHECK-LABEL: fadd_reduct_reassoc_v4f32_extrause:
748; CHECK:       // %bb.0:
749; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
750; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
751; CHECK-NEXT:    faddp s0, v0.2s
752; CHECK-NEXT:    faddp s1, v1.2s
753; CHECK-NEXT:    fadd s1, s0, s1
754; CHECK-NEXT:    fmul s0, s1, s0
755; CHECK-NEXT:    ret
756  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
757  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
758  %r = fadd fast float %r1, %r2
  ; Extra use of %r1; note this fmul carries no fast-math flags.
759  %p = fmul float %r, %r1
760  ret float %p
761}
762
763; Function Attrs: nounwind readnone
; Declarations of the llvm.vector.reduce.fadd overloads called by the tests
; above. Each takes the scalar start value followed by the vector to reduce.
; NOTE(review): the "Function Attrs" line above is not backed by any
; attribute spelled out on these declarations - confirm whether it is stale.
764declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
765declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
766declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
767declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
768declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
769declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
770declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
771declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
772