1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
8
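; These tests check strict (ordered) lowering of @llvm.vector.reduce.fmul: with
; no fast-math flags the reduction must stay a serial chain of scalar multiplies,
; e.g. a v4f32 reduction with start value %acc is evaluated as
;   (((%acc * %v0) * %v1) * %v2) * %v3
; which is why the CHECK lines below expect one mulss/mulsd per lane.
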
9;
10; vXf32 (accum)
11;
12
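; The scalar %a0 accumulator is live here, so each expansion starts by
; multiplying %a0 into lane 0 and then feeds the result through one serial
; mulss per remaining lane, reaching lanes via shuffles and 128-bit extracts.
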
13define float @test_v2f32(float %a0, <2 x float> %a1) {
14; SSE2-LABEL: test_v2f32:
15; SSE2:       # %bb.0:
16; SSE2-NEXT:    mulss %xmm1, %xmm0
17; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
18; SSE2-NEXT:    mulss %xmm1, %xmm0
19; SSE2-NEXT:    retq
20;
21; SSE41-LABEL: test_v2f32:
22; SSE41:       # %bb.0:
23; SSE41-NEXT:    mulss %xmm1, %xmm0
24; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
25; SSE41-NEXT:    mulss %xmm1, %xmm0
26; SSE41-NEXT:    retq
27;
28; AVX-LABEL: test_v2f32:
29; AVX:       # %bb.0:
30; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
31; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
32; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
33; AVX-NEXT:    retq
34;
35; AVX512-LABEL: test_v2f32:
36; AVX512:       # %bb.0:
37; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
38; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
39; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
40; AVX512-NEXT:    retq
41  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
42  ret float %1
43}
44
45define float @test_v4f32(float %a0, <4 x float> %a1) {
46; SSE2-LABEL: test_v4f32:
47; SSE2:       # %bb.0:
48; SSE2-NEXT:    mulss %xmm1, %xmm0
49; SSE2-NEXT:    movaps %xmm1, %xmm2
50; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
51; SSE2-NEXT:    mulss %xmm2, %xmm0
52; SSE2-NEXT:    movaps %xmm1, %xmm2
53; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
54; SSE2-NEXT:    mulss %xmm2, %xmm0
55; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
56; SSE2-NEXT:    mulss %xmm1, %xmm0
57; SSE2-NEXT:    retq
58;
59; SSE41-LABEL: test_v4f32:
60; SSE41:       # %bb.0:
61; SSE41-NEXT:    mulss %xmm1, %xmm0
62; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
63; SSE41-NEXT:    mulss %xmm2, %xmm0
64; SSE41-NEXT:    movaps %xmm1, %xmm2
65; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
66; SSE41-NEXT:    mulss %xmm2, %xmm0
67; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
68; SSE41-NEXT:    mulss %xmm1, %xmm0
69; SSE41-NEXT:    retq
70;
71; AVX-LABEL: test_v4f32:
72; AVX:       # %bb.0:
73; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
74; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
75; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
76; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
77; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
78; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
79; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
80; AVX-NEXT:    retq
81;
82; AVX512-LABEL: test_v4f32:
83; AVX512:       # %bb.0:
84; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
85; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
86; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
87; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
88; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
89; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
90; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
91; AVX512-NEXT:    retq
92  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
93  ret float %1
94}
95
96define float @test_v8f32(float %a0, <8 x float> %a1) {
97; SSE2-LABEL: test_v8f32:
98; SSE2:       # %bb.0:
99; SSE2-NEXT:    mulss %xmm1, %xmm0
100; SSE2-NEXT:    movaps %xmm1, %xmm3
101; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
102; SSE2-NEXT:    mulss %xmm3, %xmm0
103; SSE2-NEXT:    movaps %xmm1, %xmm3
104; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
105; SSE2-NEXT:    mulss %xmm3, %xmm0
106; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
107; SSE2-NEXT:    mulss %xmm1, %xmm0
108; SSE2-NEXT:    mulss %xmm2, %xmm0
109; SSE2-NEXT:    movaps %xmm2, %xmm1
110; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
111; SSE2-NEXT:    mulss %xmm1, %xmm0
112; SSE2-NEXT:    movaps %xmm2, %xmm1
113; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
114; SSE2-NEXT:    mulss %xmm1, %xmm0
115; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
116; SSE2-NEXT:    mulss %xmm2, %xmm0
117; SSE2-NEXT:    retq
118;
119; SSE41-LABEL: test_v8f32:
120; SSE41:       # %bb.0:
121; SSE41-NEXT:    mulss %xmm1, %xmm0
122; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
123; SSE41-NEXT:    mulss %xmm3, %xmm0
124; SSE41-NEXT:    movaps %xmm1, %xmm3
125; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
126; SSE41-NEXT:    mulss %xmm3, %xmm0
127; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
128; SSE41-NEXT:    mulss %xmm1, %xmm0
129; SSE41-NEXT:    mulss %xmm2, %xmm0
130; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
131; SSE41-NEXT:    mulss %xmm1, %xmm0
132; SSE41-NEXT:    movaps %xmm2, %xmm1
133; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
134; SSE41-NEXT:    mulss %xmm1, %xmm0
135; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
136; SSE41-NEXT:    mulss %xmm2, %xmm0
137; SSE41-NEXT:    retq
138;
139; AVX-LABEL: test_v8f32:
140; AVX:       # %bb.0:
141; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
142; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
143; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
144; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
145; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
146; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
147; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
148; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
149; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
150; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
151; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
152; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
153; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
154; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
155; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
156; AVX-NEXT:    vzeroupper
157; AVX-NEXT:    retq
158;
159; AVX512-LABEL: test_v8f32:
160; AVX512:       # %bb.0:
161; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
162; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
163; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
164; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
165; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
166; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
167; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
168; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
169; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
170; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
171; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
172; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
173; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
174; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
175; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
176; AVX512-NEXT:    vzeroupper
177; AVX512-NEXT:    retq
178  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
179  ret float %1
180}
181
182define float @test_v16f32(float %a0, <16 x float> %a1) {
183; SSE2-LABEL: test_v16f32:
184; SSE2:       # %bb.0:
185; SSE2-NEXT:    mulss %xmm1, %xmm0
186; SSE2-NEXT:    movaps %xmm1, %xmm5
187; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
188; SSE2-NEXT:    mulss %xmm5, %xmm0
189; SSE2-NEXT:    movaps %xmm1, %xmm5
190; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
191; SSE2-NEXT:    mulss %xmm5, %xmm0
192; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
193; SSE2-NEXT:    mulss %xmm1, %xmm0
194; SSE2-NEXT:    mulss %xmm2, %xmm0
195; SSE2-NEXT:    movaps %xmm2, %xmm1
196; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
197; SSE2-NEXT:    mulss %xmm1, %xmm0
198; SSE2-NEXT:    movaps %xmm2, %xmm1
199; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
200; SSE2-NEXT:    mulss %xmm1, %xmm0
201; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
202; SSE2-NEXT:    mulss %xmm2, %xmm0
203; SSE2-NEXT:    mulss %xmm3, %xmm0
204; SSE2-NEXT:    movaps %xmm3, %xmm1
205; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
206; SSE2-NEXT:    mulss %xmm1, %xmm0
207; SSE2-NEXT:    movaps %xmm3, %xmm1
208; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
209; SSE2-NEXT:    mulss %xmm1, %xmm0
210; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
211; SSE2-NEXT:    mulss %xmm3, %xmm0
212; SSE2-NEXT:    mulss %xmm4, %xmm0
213; SSE2-NEXT:    movaps %xmm4, %xmm1
214; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
215; SSE2-NEXT:    mulss %xmm1, %xmm0
216; SSE2-NEXT:    movaps %xmm4, %xmm1
217; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
218; SSE2-NEXT:    mulss %xmm1, %xmm0
219; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
220; SSE2-NEXT:    mulss %xmm4, %xmm0
221; SSE2-NEXT:    retq
222;
223; SSE41-LABEL: test_v16f32:
224; SSE41:       # %bb.0:
225; SSE41-NEXT:    mulss %xmm1, %xmm0
226; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
227; SSE41-NEXT:    mulss %xmm5, %xmm0
228; SSE41-NEXT:    movaps %xmm1, %xmm5
229; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
230; SSE41-NEXT:    mulss %xmm5, %xmm0
231; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
232; SSE41-NEXT:    mulss %xmm1, %xmm0
233; SSE41-NEXT:    mulss %xmm2, %xmm0
234; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
235; SSE41-NEXT:    mulss %xmm1, %xmm0
236; SSE41-NEXT:    movaps %xmm2, %xmm1
237; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
238; SSE41-NEXT:    mulss %xmm1, %xmm0
239; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
240; SSE41-NEXT:    mulss %xmm2, %xmm0
241; SSE41-NEXT:    mulss %xmm3, %xmm0
242; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
243; SSE41-NEXT:    mulss %xmm1, %xmm0
244; SSE41-NEXT:    movaps %xmm3, %xmm1
245; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
246; SSE41-NEXT:    mulss %xmm1, %xmm0
247; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
248; SSE41-NEXT:    mulss %xmm3, %xmm0
249; SSE41-NEXT:    mulss %xmm4, %xmm0
250; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
251; SSE41-NEXT:    mulss %xmm1, %xmm0
252; SSE41-NEXT:    movaps %xmm4, %xmm1
253; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
254; SSE41-NEXT:    mulss %xmm1, %xmm0
255; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
256; SSE41-NEXT:    mulss %xmm4, %xmm0
257; SSE41-NEXT:    retq
258;
259; AVX-LABEL: test_v16f32:
260; AVX:       # %bb.0:
261; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
262; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
263; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
264; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
265; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
266; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
267; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
268; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
269; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
270; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
271; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
272; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
273; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
274; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
275; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
276; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
277; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
278; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
279; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
280; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
281; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
282; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
283; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
284; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
285; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
286; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
287; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
288; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
289; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
290; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
291; AVX-NEXT:    vzeroupper
292; AVX-NEXT:    retq
293;
294; AVX512-LABEL: test_v16f32:
295; AVX512:       # %bb.0:
296; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
297; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
298; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
299; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
300; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
301; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
302; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
303; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
304; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
305; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
306; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
307; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
308; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
309; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
310; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
311; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
312; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
313; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
314; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
315; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
316; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
317; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
318; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
319; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
320; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
321; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
322; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
323; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
324; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
325; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
326; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
327; AVX512-NEXT:    vzeroupper
328; AVX512-NEXT:    retq
329  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
330  ret float %1
331}
332
333;
334; vXf32 (one)
335;
336
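; A 1.0 start value folds the initial multiply away: lane 0 is used directly,
; leaving one fewer mulss than the lane count.
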
337define float @test_v2f32_one(<2 x float> %a0) {
338; SSE2-LABEL: test_v2f32_one:
339; SSE2:       # %bb.0:
340; SSE2-NEXT:    movaps %xmm0, %xmm1
341; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
342; SSE2-NEXT:    mulss %xmm1, %xmm0
343; SSE2-NEXT:    retq
344;
345; SSE41-LABEL: test_v2f32_one:
346; SSE41:       # %bb.0:
347; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
348; SSE41-NEXT:    mulss %xmm1, %xmm0
349; SSE41-NEXT:    retq
350;
351; AVX-LABEL: test_v2f32_one:
352; AVX:       # %bb.0:
353; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
354; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
355; AVX-NEXT:    retq
356;
357; AVX512-LABEL: test_v2f32_one:
358; AVX512:       # %bb.0:
359; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
360; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
361; AVX512-NEXT:    retq
362  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
363  ret float %1
364}
365
366define float @test_v4f32_one(<4 x float> %a0) {
367; SSE2-LABEL: test_v4f32_one:
368; SSE2:       # %bb.0:
369; SSE2-NEXT:    movaps %xmm0, %xmm1
370; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
371; SSE2-NEXT:    mulss %xmm0, %xmm1
372; SSE2-NEXT:    movaps %xmm0, %xmm2
373; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
374; SSE2-NEXT:    mulss %xmm1, %xmm2
375; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
376; SSE2-NEXT:    mulss %xmm2, %xmm0
377; SSE2-NEXT:    retq
378;
379; SSE41-LABEL: test_v4f32_one:
380; SSE41:       # %bb.0:
381; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
382; SSE41-NEXT:    mulss %xmm0, %xmm1
383; SSE41-NEXT:    movaps %xmm0, %xmm2
384; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
385; SSE41-NEXT:    mulss %xmm1, %xmm2
386; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
387; SSE41-NEXT:    mulss %xmm2, %xmm0
388; SSE41-NEXT:    retq
389;
390; AVX-LABEL: test_v4f32_one:
391; AVX:       # %bb.0:
392; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
393; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
394; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
395; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
396; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
397; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
398; AVX-NEXT:    retq
399;
400; AVX512-LABEL: test_v4f32_one:
401; AVX512:       # %bb.0:
402; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
403; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
404; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
405; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
406; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
407; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
408; AVX512-NEXT:    retq
409  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
410  ret float %1
411}
412
413define float @test_v8f32_one(<8 x float> %a0) {
414; SSE2-LABEL: test_v8f32_one:
415; SSE2:       # %bb.0:
416; SSE2-NEXT:    movaps %xmm0, %xmm2
417; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
418; SSE2-NEXT:    mulss %xmm0, %xmm2
419; SSE2-NEXT:    movaps %xmm0, %xmm3
420; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
421; SSE2-NEXT:    mulss %xmm2, %xmm3
422; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
423; SSE2-NEXT:    mulss %xmm3, %xmm0
424; SSE2-NEXT:    mulss %xmm1, %xmm0
425; SSE2-NEXT:    movaps %xmm1, %xmm2
426; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
427; SSE2-NEXT:    mulss %xmm2, %xmm0
428; SSE2-NEXT:    movaps %xmm1, %xmm2
429; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
430; SSE2-NEXT:    mulss %xmm2, %xmm0
431; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
432; SSE2-NEXT:    mulss %xmm1, %xmm0
433; SSE2-NEXT:    retq
434;
435; SSE41-LABEL: test_v8f32_one:
436; SSE41:       # %bb.0:
437; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
438; SSE41-NEXT:    mulss %xmm0, %xmm2
439; SSE41-NEXT:    movaps %xmm0, %xmm3
440; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
441; SSE41-NEXT:    mulss %xmm2, %xmm3
442; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
443; SSE41-NEXT:    mulss %xmm3, %xmm0
444; SSE41-NEXT:    mulss %xmm1, %xmm0
445; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
446; SSE41-NEXT:    mulss %xmm2, %xmm0
447; SSE41-NEXT:    movaps %xmm1, %xmm2
448; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
449; SSE41-NEXT:    mulss %xmm2, %xmm0
450; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
451; SSE41-NEXT:    mulss %xmm1, %xmm0
452; SSE41-NEXT:    retq
453;
454; AVX-LABEL: test_v8f32_one:
455; AVX:       # %bb.0:
456; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
457; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
458; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
459; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
460; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
461; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
462; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
463; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm1
464; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
465; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
466; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
467; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
468; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
469; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
470; AVX-NEXT:    vzeroupper
471; AVX-NEXT:    retq
472;
473; AVX512-LABEL: test_v8f32_one:
474; AVX512:       # %bb.0:
475; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
476; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
477; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
478; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
479; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
480; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
481; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
482; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
483; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
484; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
485; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
486; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
487; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
488; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
489; AVX512-NEXT:    vzeroupper
490; AVX512-NEXT:    retq
491  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
492  ret float %1
493}
494
495define float @test_v16f32_one(<16 x float> %a0) {
496; SSE2-LABEL: test_v16f32_one:
497; SSE2:       # %bb.0:
498; SSE2-NEXT:    movaps %xmm0, %xmm4
499; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
500; SSE2-NEXT:    mulss %xmm0, %xmm4
501; SSE2-NEXT:    movaps %xmm0, %xmm5
502; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
503; SSE2-NEXT:    mulss %xmm4, %xmm5
504; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
505; SSE2-NEXT:    mulss %xmm5, %xmm0
506; SSE2-NEXT:    mulss %xmm1, %xmm0
507; SSE2-NEXT:    movaps %xmm1, %xmm4
508; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
509; SSE2-NEXT:    mulss %xmm4, %xmm0
510; SSE2-NEXT:    movaps %xmm1, %xmm4
511; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
512; SSE2-NEXT:    mulss %xmm4, %xmm0
513; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
514; SSE2-NEXT:    mulss %xmm1, %xmm0
515; SSE2-NEXT:    mulss %xmm2, %xmm0
516; SSE2-NEXT:    movaps %xmm2, %xmm1
517; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
518; SSE2-NEXT:    mulss %xmm1, %xmm0
519; SSE2-NEXT:    movaps %xmm2, %xmm1
520; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
521; SSE2-NEXT:    mulss %xmm1, %xmm0
522; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
523; SSE2-NEXT:    mulss %xmm2, %xmm0
524; SSE2-NEXT:    mulss %xmm3, %xmm0
525; SSE2-NEXT:    movaps %xmm3, %xmm1
526; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
527; SSE2-NEXT:    mulss %xmm1, %xmm0
528; SSE2-NEXT:    movaps %xmm3, %xmm1
529; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
530; SSE2-NEXT:    mulss %xmm1, %xmm0
531; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
532; SSE2-NEXT:    mulss %xmm3, %xmm0
533; SSE2-NEXT:    retq
534;
535; SSE41-LABEL: test_v16f32_one:
536; SSE41:       # %bb.0:
537; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
538; SSE41-NEXT:    mulss %xmm0, %xmm4
539; SSE41-NEXT:    movaps %xmm0, %xmm5
540; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
541; SSE41-NEXT:    mulss %xmm4, %xmm5
542; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
543; SSE41-NEXT:    mulss %xmm5, %xmm0
544; SSE41-NEXT:    mulss %xmm1, %xmm0
545; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
546; SSE41-NEXT:    mulss %xmm4, %xmm0
547; SSE41-NEXT:    movaps %xmm1, %xmm4
548; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
549; SSE41-NEXT:    mulss %xmm4, %xmm0
550; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
551; SSE41-NEXT:    mulss %xmm1, %xmm0
552; SSE41-NEXT:    mulss %xmm2, %xmm0
553; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
554; SSE41-NEXT:    mulss %xmm1, %xmm0
555; SSE41-NEXT:    movaps %xmm2, %xmm1
556; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
557; SSE41-NEXT:    mulss %xmm1, %xmm0
558; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
559; SSE41-NEXT:    mulss %xmm2, %xmm0
560; SSE41-NEXT:    mulss %xmm3, %xmm0
561; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
562; SSE41-NEXT:    mulss %xmm1, %xmm0
563; SSE41-NEXT:    movaps %xmm3, %xmm1
564; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
565; SSE41-NEXT:    mulss %xmm1, %xmm0
566; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
567; SSE41-NEXT:    mulss %xmm3, %xmm0
568; SSE41-NEXT:    retq
569;
570; AVX-LABEL: test_v16f32_one:
571; AVX:       # %bb.0:
572; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
573; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm2
574; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
575; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
576; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
577; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
578; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
579; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm2
580; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
581; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
582; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
583; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
584; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
585; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm0
586; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
587; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
588; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
589; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
590; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
591; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
592; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
593; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
594; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
595; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
596; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
597; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
598; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
599; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
600; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
601; AVX-NEXT:    vzeroupper
602; AVX-NEXT:    retq
603;
604; AVX512-LABEL: test_v16f32_one:
605; AVX512:       # %bb.0:
606; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
607; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
608; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
609; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
610; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
611; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
612; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
613; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
614; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
615; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
616; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
617; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
618; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
619; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
620; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
621; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
622; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
623; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
624; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
625; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
626; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
627; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
628; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
629; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
630; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
631; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
632; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
633; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
634; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
635; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
636; AVX512-NEXT:    vzeroupper
637; AVX512-NEXT:    retq
638  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
639  ret float %1
640}
641
642;
643; vXf32 (undef)
644;
645
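; With an undef start value the first partial product constant-folds, so the
; expected chains open with a mulss against a constant-pool operand
; ({{\.?LCPI[0-9]+_[0-9]+}}(%rip)) rather than lane 0.
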
646define float @test_v2f32_undef(<2 x float> %a0) {
647; SSE2-LABEL: test_v2f32_undef:
648; SSE2:       # %bb.0:
649; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
650; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
651; SSE2-NEXT:    retq
652;
653; SSE41-LABEL: test_v2f32_undef:
654; SSE41:       # %bb.0:
655; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
656; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
657; SSE41-NEXT:    retq
658;
659; AVX-LABEL: test_v2f32_undef:
660; AVX:       # %bb.0:
661; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
662; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
663; AVX-NEXT:    retq
664;
665; AVX512-LABEL: test_v2f32_undef:
666; AVX512:       # %bb.0:
667; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
668; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
669; AVX512-NEXT:    retq
670  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
671  ret float %1
672}
673
674define float @test_v4f32_undef(<4 x float> %a0) {
675; SSE2-LABEL: test_v4f32_undef:
676; SSE2:       # %bb.0:
677; SSE2-NEXT:    movaps %xmm0, %xmm1
678; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
679; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
680; SSE2-NEXT:    movaps %xmm0, %xmm2
681; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
682; SSE2-NEXT:    mulss %xmm1, %xmm2
683; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
684; SSE2-NEXT:    mulss %xmm2, %xmm0
685; SSE2-NEXT:    retq
686;
687; SSE41-LABEL: test_v4f32_undef:
688; SSE41:       # %bb.0:
689; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
690; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
691; SSE41-NEXT:    movaps %xmm0, %xmm2
692; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
693; SSE41-NEXT:    mulss %xmm1, %xmm2
694; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
695; SSE41-NEXT:    mulss %xmm2, %xmm0
696; SSE41-NEXT:    retq
697;
698; AVX-LABEL: test_v4f32_undef:
699; AVX:       # %bb.0:
700; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
701; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
702; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
703; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
704; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
705; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
706; AVX-NEXT:    retq
707;
708; AVX512-LABEL: test_v4f32_undef:
709; AVX512:       # %bb.0:
710; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
711; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
712; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
713; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
714; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
715; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
716; AVX512-NEXT:    retq
717  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
718  ret float %1
719}
720
721define float @test_v8f32_undef(<8 x float> %a0) {
722; SSE2-LABEL: test_v8f32_undef:
723; SSE2:       # %bb.0:
724; SSE2-NEXT:    movaps %xmm0, %xmm2
725; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
726; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
727; SSE2-NEXT:    movaps %xmm0, %xmm3
728; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
729; SSE2-NEXT:    mulss %xmm2, %xmm3
730; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
731; SSE2-NEXT:    mulss %xmm3, %xmm0
732; SSE2-NEXT:    mulss %xmm1, %xmm0
733; SSE2-NEXT:    movaps %xmm1, %xmm2
734; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
735; SSE2-NEXT:    mulss %xmm2, %xmm0
736; SSE2-NEXT:    movaps %xmm1, %xmm2
737; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
738; SSE2-NEXT:    mulss %xmm2, %xmm0
739; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
740; SSE2-NEXT:    mulss %xmm1, %xmm0
741; SSE2-NEXT:    retq
742;
743; SSE41-LABEL: test_v8f32_undef:
744; SSE41:       # %bb.0:
745; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
746; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
747; SSE41-NEXT:    movaps %xmm0, %xmm3
748; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
749; SSE41-NEXT:    mulss %xmm2, %xmm3
750; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
751; SSE41-NEXT:    mulss %xmm3, %xmm0
752; SSE41-NEXT:    mulss %xmm1, %xmm0
753; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
754; SSE41-NEXT:    mulss %xmm2, %xmm0
755; SSE41-NEXT:    movaps %xmm1, %xmm2
756; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
757; SSE41-NEXT:    mulss %xmm2, %xmm0
758; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
759; SSE41-NEXT:    mulss %xmm1, %xmm0
760; SSE41-NEXT:    retq
761;
762; AVX-LABEL: test_v8f32_undef:
763; AVX:       # %bb.0:
764; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
765; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
766; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
767; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
768; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
769; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
770; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
771; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm1
772; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
773; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
774; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
775; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
776; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
777; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
778; AVX-NEXT:    vzeroupper
779; AVX-NEXT:    retq
780;
781; AVX512-LABEL: test_v8f32_undef:
782; AVX512:       # %bb.0:
783; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
784; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
785; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
786; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
787; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
788; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
789; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
790; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
791; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
792; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
793; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
794; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
795; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
796; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
797; AVX512-NEXT:    vzeroupper
798; AVX512-NEXT:    retq
799  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
800  ret float %1
801}
802
803define float @test_v16f32_undef(<16 x float> %a0) {
804; SSE2-LABEL: test_v16f32_undef:
805; SSE2:       # %bb.0:
806; SSE2-NEXT:    movaps %xmm0, %xmm4
807; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
808; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
809; SSE2-NEXT:    movaps %xmm0, %xmm5
810; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
811; SSE2-NEXT:    mulss %xmm4, %xmm5
812; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
813; SSE2-NEXT:    mulss %xmm5, %xmm0
814; SSE2-NEXT:    mulss %xmm1, %xmm0
815; SSE2-NEXT:    movaps %xmm1, %xmm4
816; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
817; SSE2-NEXT:    mulss %xmm4, %xmm0
818; SSE2-NEXT:    movaps %xmm1, %xmm4
819; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
820; SSE2-NEXT:    mulss %xmm4, %xmm0
821; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
822; SSE2-NEXT:    mulss %xmm1, %xmm0
823; SSE2-NEXT:    mulss %xmm2, %xmm0
824; SSE2-NEXT:    movaps %xmm2, %xmm1
825; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
826; SSE2-NEXT:    mulss %xmm1, %xmm0
827; SSE2-NEXT:    movaps %xmm2, %xmm1
828; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
829; SSE2-NEXT:    mulss %xmm1, %xmm0
830; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
831; SSE2-NEXT:    mulss %xmm2, %xmm0
832; SSE2-NEXT:    mulss %xmm3, %xmm0
833; SSE2-NEXT:    movaps %xmm3, %xmm1
834; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
835; SSE2-NEXT:    mulss %xmm1, %xmm0
836; SSE2-NEXT:    movaps %xmm3, %xmm1
837; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
838; SSE2-NEXT:    mulss %xmm1, %xmm0
839; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
840; SSE2-NEXT:    mulss %xmm3, %xmm0
841; SSE2-NEXT:    retq
842;
843; SSE41-LABEL: test_v16f32_undef:
844; SSE41:       # %bb.0:
845; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
846; SSE41-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
847; SSE41-NEXT:    movaps %xmm0, %xmm5
848; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
849; SSE41-NEXT:    mulss %xmm4, %xmm5
850; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
851; SSE41-NEXT:    mulss %xmm5, %xmm0
852; SSE41-NEXT:    mulss %xmm1, %xmm0
853; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
854; SSE41-NEXT:    mulss %xmm4, %xmm0
855; SSE41-NEXT:    movaps %xmm1, %xmm4
856; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
857; SSE41-NEXT:    mulss %xmm4, %xmm0
858; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
859; SSE41-NEXT:    mulss %xmm1, %xmm0
860; SSE41-NEXT:    mulss %xmm2, %xmm0
861; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
862; SSE41-NEXT:    mulss %xmm1, %xmm0
863; SSE41-NEXT:    movaps %xmm2, %xmm1
864; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
865; SSE41-NEXT:    mulss %xmm1, %xmm0
866; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
867; SSE41-NEXT:    mulss %xmm2, %xmm0
868; SSE41-NEXT:    mulss %xmm3, %xmm0
869; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
870; SSE41-NEXT:    mulss %xmm1, %xmm0
871; SSE41-NEXT:    movaps %xmm3, %xmm1
872; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
873; SSE41-NEXT:    mulss %xmm1, %xmm0
874; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
875; SSE41-NEXT:    mulss %xmm3, %xmm0
876; SSE41-NEXT:    retq
877;
878; AVX-LABEL: test_v16f32_undef:
879; AVX:       # %bb.0:
880; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
881; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
882; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
883; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
884; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
885; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
886; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
887; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm2
888; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
889; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
890; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
891; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
892; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
893; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm0
894; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
895; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
896; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
897; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
898; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
899; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
900; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
901; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
902; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
903; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
904; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
905; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
906; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
907; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
908; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
909; AVX-NEXT:    vzeroupper
910; AVX-NEXT:    retq
911;
912; AVX512-LABEL: test_v16f32_undef:
913; AVX512:       # %bb.0:
914; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
915; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
916; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
917; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
918; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
919; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
920; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
921; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
922; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
923; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
924; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
925; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
926; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
927; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
928; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
929; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
930; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
931; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
932; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
933; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
934; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
935; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
936; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
937; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
938; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
939; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
940; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
941; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
942; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
943; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
944; AVX512-NEXT:    vzeroupper
945; AVX512-NEXT:    retq
946  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
947  ret float %1
948}
949
950;
951; vXf64 (accum)
952;
953
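; Double-precision counterparts of the accumulator tests above: one serial
; mulsd per lane, with odd lanes reached through unpckhpd/vshufpd shuffles and
; wider vectors split with vextractf128/vextractf32x4.
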
954define double @test_v2f64(double %a0, <2 x double> %a1) {
955; SSE-LABEL: test_v2f64:
956; SSE:       # %bb.0:
957; SSE-NEXT:    mulsd %xmm1, %xmm0
958; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
959; SSE-NEXT:    mulsd %xmm1, %xmm0
960; SSE-NEXT:    retq
961;
962; AVX-LABEL: test_v2f64:
963; AVX:       # %bb.0:
964; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
965; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
966; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
967; AVX-NEXT:    retq
968;
969; AVX512-LABEL: test_v2f64:
970; AVX512:       # %bb.0:
971; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
972; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
973; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
974; AVX512-NEXT:    retq
975  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
976  ret double %1
977}
978
979define double @test_v4f64(double %a0, <4 x double> %a1) {
980; SSE-LABEL: test_v4f64:
981; SSE:       # %bb.0:
982; SSE-NEXT:    mulsd %xmm1, %xmm0
983; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
984; SSE-NEXT:    mulsd %xmm1, %xmm0
985; SSE-NEXT:    mulsd %xmm2, %xmm0
986; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
987; SSE-NEXT:    mulsd %xmm2, %xmm0
988; SSE-NEXT:    retq
989;
990; AVX-LABEL: test_v4f64:
991; AVX:       # %bb.0:
992; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
993; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
994; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
995; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
996; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
997; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
998; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
999; AVX-NEXT:    vzeroupper
1000; AVX-NEXT:    retq
1001;
1002; AVX512-LABEL: test_v4f64:
1003; AVX512:       # %bb.0:
1004; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1005; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1006; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1007; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
1008; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1009; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1010; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1011; AVX512-NEXT:    vzeroupper
1012; AVX512-NEXT:    retq
1013  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
1014  ret double %1
1015}
1016
1017define double @test_v8f64(double %a0, <8 x double> %a1) {
1018; SSE-LABEL: test_v8f64:
1019; SSE:       # %bb.0:
1020; SSE-NEXT:    mulsd %xmm1, %xmm0
1021; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1022; SSE-NEXT:    mulsd %xmm1, %xmm0
1023; SSE-NEXT:    mulsd %xmm2, %xmm0
1024; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1025; SSE-NEXT:    mulsd %xmm2, %xmm0
1026; SSE-NEXT:    mulsd %xmm3, %xmm0
1027; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1028; SSE-NEXT:    mulsd %xmm3, %xmm0
1029; SSE-NEXT:    mulsd %xmm4, %xmm0
1030; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1031; SSE-NEXT:    mulsd %xmm4, %xmm0
1032; SSE-NEXT:    retq
1033;
1034; AVX-LABEL: test_v8f64:
1035; AVX:       # %bb.0:
1036; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1037; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
1038; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1039; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
1040; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1041; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1042; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1043; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1044; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
1045; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1046; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
1047; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1048; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1049; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1050; AVX-NEXT:    vzeroupper
1051; AVX-NEXT:    retq
1052;
1053; AVX512-LABEL: test_v8f64:
1054; AVX512:       # %bb.0:
1055; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1056; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1057; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1058; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
1059; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1060; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
1061; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1062; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
1063; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1064; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
1065; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1066; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
1067; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1068; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1069; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1070; AVX512-NEXT:    vzeroupper
1071; AVX512-NEXT:    retq
1072  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
1073  ret double %1
1074}
1075
1076define double @test_v16f64(double %a0, <16 x double> %a1) {
1077; SSE2-LABEL: test_v16f64:
1078; SSE2:       # %bb.0:
1079; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
1080; SSE2-NEXT:    mulsd %xmm1, %xmm0
1081; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1082; SSE2-NEXT:    mulsd %xmm1, %xmm0
1083; SSE2-NEXT:    mulsd %xmm2, %xmm0
1084; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1085; SSE2-NEXT:    mulsd %xmm2, %xmm0
1086; SSE2-NEXT:    mulsd %xmm3, %xmm0
1087; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1088; SSE2-NEXT:    mulsd %xmm3, %xmm0
1089; SSE2-NEXT:    mulsd %xmm4, %xmm0
1090; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1091; SSE2-NEXT:    mulsd %xmm4, %xmm0
1092; SSE2-NEXT:    mulsd %xmm5, %xmm0
1093; SSE2-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
1094; SSE2-NEXT:    mulsd %xmm5, %xmm0
1095; SSE2-NEXT:    mulsd %xmm6, %xmm0
1096; SSE2-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
1097; SSE2-NEXT:    mulsd %xmm6, %xmm0
1098; SSE2-NEXT:    mulsd %xmm7, %xmm0
1099; SSE2-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
1100; SSE2-NEXT:    mulsd %xmm7, %xmm0
1101; SSE2-NEXT:    mulsd %xmm8, %xmm0
1102; SSE2-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
1103; SSE2-NEXT:    mulsd %xmm8, %xmm0
1104; SSE2-NEXT:    retq
1105;
1106; SSE41-LABEL: test_v16f64:
1107; SSE41:       # %bb.0:
1108; SSE41-NEXT:    mulsd %xmm1, %xmm0
1109; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1110; SSE41-NEXT:    mulsd %xmm1, %xmm0
1111; SSE41-NEXT:    mulsd %xmm2, %xmm0
1112; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1113; SSE41-NEXT:    mulsd %xmm2, %xmm0
1114; SSE41-NEXT:    mulsd %xmm3, %xmm0
1115; SSE41-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
1116; SSE41-NEXT:    mulsd %xmm3, %xmm0
1117; SSE41-NEXT:    mulsd %xmm4, %xmm0
1118; SSE41-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
1119; SSE41-NEXT:    mulsd %xmm4, %xmm0
1120; SSE41-NEXT:    mulsd %xmm5, %xmm0
1121; SSE41-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
1122; SSE41-NEXT:    mulsd %xmm5, %xmm0
1123; SSE41-NEXT:    mulsd %xmm6, %xmm0
1124; SSE41-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
1125; SSE41-NEXT:    mulsd %xmm6, %xmm0
1126; SSE41-NEXT:    mulsd %xmm7, %xmm0
1127; SSE41-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
1128; SSE41-NEXT:    mulsd %xmm7, %xmm0
1129; SSE41-NEXT:    mulsd {{[0-9]+}}(%rsp), %xmm0
1130; SSE41-NEXT:    mulsd {{[0-9]+}}(%rsp), %xmm0
1131; SSE41-NEXT:    retq
1132;
1133; AVX-LABEL: test_v16f64:
1134; AVX:       # %bb.0:
1135; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1136; AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
1137; AVX-NEXT:    vmulsd %xmm5, %xmm0, %xmm0
1138; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
1139; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1140; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1141; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1142; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1143; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
1144; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1145; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
1146; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1147; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1148; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1149; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1150; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
1151; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1152; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
1153; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1154; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1155; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1156; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
1157; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
1158; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1159; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
1160; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1161; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1162; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1163; AVX-NEXT:    vzeroupper
1164; AVX-NEXT:    retq
1165;
1166; AVX512-LABEL: test_v16f64:
1167; AVX512:       # %bb.0:
1168; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1169; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
1170; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1171; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
1172; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1173; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
1174; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1175; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
1176; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1177; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
1178; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
1179; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
1180; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1181; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1182; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1183; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
1184; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
1185; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1186; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
1187; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1188; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1189; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1190; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
1191; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1192; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1193; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1194; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
1195; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1196; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
1197; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1198; AVX512-NEXT:    vzeroupper
1199; AVX512-NEXT:    retq
1200  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
1201  ret double %1
1202}
1203
1204;
1205; vXf64 (one)
1206;
1207
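; As with the f32 tests, a 1.0 start value folds the first multiply away,
; leaving one fewer mulsd than the lane count.
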
1208define double @test_v2f64_one(<2 x double> %a0) {
1209; SSE-LABEL: test_v2f64_one:
1210; SSE:       # %bb.0:
1211; SSE-NEXT:    movapd %xmm0, %xmm1
1212; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1213; SSE-NEXT:    mulsd %xmm1, %xmm0
1214; SSE-NEXT:    retq
1215;
1216; AVX-LABEL: test_v2f64_one:
1217; AVX:       # %bb.0:
1218; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1219; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1220; AVX-NEXT:    retq
1221;
1222; AVX512-LABEL: test_v2f64_one:
1223; AVX512:       # %bb.0:
1224; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1225; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
1226; AVX512-NEXT:    retq
1227  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
1228  ret double %1
1229}
1230
1231define double @test_v4f64_one(<4 x double> %a0) {
1232; SSE-LABEL: test_v4f64_one:
1233; SSE:       # %bb.0:
1234; SSE-NEXT:    movapd %xmm0, %xmm2
1235; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
1236; SSE-NEXT:    mulsd %xmm2, %xmm0
1237; SSE-NEXT:    mulsd %xmm1, %xmm0
1238; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
1239; SSE-NEXT:    mulsd %xmm1, %xmm0
1240; SSE-NEXT:    retq
1241;
1242; AVX-LABEL: test_v4f64_one:
1243; AVX:       # %bb.0:
1244; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1245; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
1246; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
1247; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
1248; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1249; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
1250; AVX-NEXT:    vzeroupper
1251; AVX-NEXT:    retq
1252;
1253; AVX512-LABEL: test_v4f64_one:
1254; AVX512:       # %bb.0:
1255; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
1256; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
1257; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
1258; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
1259; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1260; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
1261; AVX512-NEXT:    vzeroupper
1262; AVX512-NEXT:    retq
1263  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
1264  ret double %1
1265}
1266
define double @test_v8f64_one(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_one:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_one(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_one:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT:    mulsd %xmm8, %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_one:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

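; The undef start value appears to be folded to a constant, so each scalar
; reduction below starts with a multiply against a constant-pool operand
; ({{\.?LCPI[0-9]+_[0-9]+}}(%rip)) rather than against a second register.
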
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulsd %xmm2, %xmm0
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulsd %xmm3, %xmm0
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    mulsd %xmm4, %xmm0
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    mulsd %xmm5, %xmm0
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    mulsd %xmm6, %xmm0
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    mulsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)