; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx,+fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
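;
; These tests exercise fast-math (reassociation-enabled) llvm.vector.reduce.fadd
; lowering for v2f32..v16f32 and v2f64..v16f64. With reassociation allowed, the
; reduction is expected to lower to a log2 shuffle-and-add tree (plus a final
; scalar add of the start value where one is live) rather than a sequential
; chain of scalar adds; the +fast-hops AVX1 run additionally uses haddps/haddpd.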

;
; vXf32 (accum)
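; The (accum) variants take a live scalar start value %a0, which should show up
; as one trailing addss of the reduced vector into %xmm0.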
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm2
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm4, %xmm2
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    addps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    addss %xmm2, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm4, %xmm2
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
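; With a 0.0 start value the reassociated reduction folds the accumulator away,
; so only the shuffle/add (or haddps) tree should remain.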
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm0
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
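; These variants also pass a 0.0 start value and are expected to produce the
; same code as the (zero) tests above.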
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    addps %xmm3, %xmm1
; SSE41-NEXT:    addps %xmm2, %xmm0
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
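; f64 counterparts of the (accum) tests above: the same reduction trees, using
; addpd/addsd (and vhaddpd for the AVX1-FAST run) with a trailing addsd of %a0.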
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm4, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    addsd %xmm1, %xmm2
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    addpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm4
; SSE-NEXT:    addpd %xmm1, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT:    addsd %xmm4, %xmm1
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
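; As with the f32 (zero) tests, the 0.0 start value should be folded away,
; leaving only the vector reduction.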
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
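; These variants also pass a 0.0 start value and should match the (zero) tests above.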
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
