; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm2
; SSE2-NEXT:    mulss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm2
; SSE41-NEXT:    mulss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm4, %xmm2
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    mulps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT:    mulss %xmm2, %xmm1
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm4, %xmm2
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    mulss %xmm2, %xmm1
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulps %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    mulps %xmm3, %xmm1
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    mulps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    mulss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    mulps %xmm3, %xmm1
; SSE41-NEXT:    mulps %xmm2, %xmm0
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    mulps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    mulss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm4, %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT:    mulsd %xmm4, %xmm1
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)