xref: /llvm-project/llvm/test/CodeGen/X86/vector-reduce-mul.ll (revision 86eff6be686a1e41e13c08ebfc2db4dd4d58e7c6)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQVL

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: test_v2i64:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1OR2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1OR2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1OR2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1OR2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1OR2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1OR2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT:    vmovq %xmm0, %rax
; AVX1OR2-NEXT:    retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v2i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v2i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v2i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a0)
  ret i64 %1
}
define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v4i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v4i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v4i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %a0)
  ret i64 %1
}
define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm1, %xmm5
; SSE-NEXT:    paddq %xmm4, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm0, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v8i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v8i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v8i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %a0)
  ret i64 %1
}
define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm2, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm6, %xmm8
; SSE-NEXT:    movdqa %xmm6, %xmm9
; SSE-NEXT:    psrlq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm2, %xmm9
; SSE-NEXT:    paddq %xmm8, %xmm9
; SSE-NEXT:    psllq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm9, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm6
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm4, %xmm6
; SSE-NEXT:    movdqa %xmm4, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm0, %xmm8
; SSE-NEXT:    paddq %xmm6, %xmm8
; SSE-NEXT:    psllq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm7, %xmm4
; SSE-NEXT:    movdqa %xmm7, %xmm6
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm3, %xmm6
; SSE-NEXT:    paddq %xmm4, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm6, %xmm3
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm5, %xmm4
; SSE-NEXT:    movdqa %xmm5, %xmm6
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm1, %xmm6
; SSE-NEXT:    paddq %xmm4, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm5, %xmm1
; SSE-NEXT:    paddq %xmm6, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm1, %xmm5
; SSE-NEXT:    paddq %xmm4, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm0, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm5
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm6
; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
; AVX1-NEXT:    vpmuludq %xmm3, %xmm6, %xmm6
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT:    vpaddq %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm1
; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm2
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm2
; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm5
; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm5
; AVX2-NEXT:    vpaddq %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsllq $32, %ymm4, %ymm4
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm4
; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v16i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v16i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v16i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v16i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %a0)
  ret i64 %1
}
769
770;
771; vXi32
772;
773
define i32 @test_v2i32(<2 x i32> %a0) {
; Multiply-reduce the two i32 lanes of %a0 into a single scalar returned in eax.
; Pre-SSE4.1 must use pmuludq (32x32->64 multiply on even lanes) since there is
; no packed 32-bit multiply; SSE4.1 and AVX use pmulld directly.
; SSE2-LABEL: test_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a0)
  ret i32 %1
}
798
define i32 @test_v4i32(<4 x i32> %a0) {
; Multiply-reduce four i32 lanes via log2(4) = 2 shuffle+multiply steps
; (high half times low half, then odd lane times even lane).
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a0)
  ret i32 %1
}
831
define i32 @test_v8i32(<8 x i32> %a0) {
; Multiply-reduce eight i32 lanes. AVX targets first fold the upper 128-bit
; half into the lower, then reduce within the xmm register.
; SSE2-LABEL: test_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
; SSE2-NEXT:    pmuludq %xmm3, %xmm0
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a0)
  ret i32 %1
}
895
define i32 @test_v16i32(<16 x i32> %a0) {
; Multiply-reduce sixteen i32 lanes (two ymm / four xmm inputs); halves are
; multiplied pairwise before the final in-register reduction.
; SSE2-LABEL: test_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a0)
  ret i32 %1
}
973
define i32 @test_v32i32(<32 x i32> %a0) {
; Multiply-reduce thirty-two i32 lanes (eight xmm / four ymm / two zmm inputs).
; Exercises the widest i32 case, including spill-free register pressure in the
; pre-SSE4.1 pmuludq expansion (uses xmm8-xmm11).
; SSE2-LABEL: test_v32i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm9
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm10
; SSE2-NEXT:    pmuludq %xmm9, %xmm10
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm9
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm11
; SSE2-NEXT:    pmuludq %xmm9, %xmm11
; SSE2-NEXT:    pmuludq %xmm10, %xmm11
; SSE2-NEXT:    pmuludq %xmm6, %xmm2
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pmuludq %xmm7, %xmm3
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,2,2,2]
; SSE2-NEXT:    pmuludq %xmm11, %xmm1
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm6, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm7, %xmm3
; SSE41-NEXT:    pmulld %xmm5, %xmm1
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm5
; AVX1-NEXT:    vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %a0)
  ret i32 %1
}
1076
1077;
1078; vXi16
1079;
1080
define i16 @test_v2i16(<2 x i16> %a0) {
; Multiply-reduce two i16 lanes; lane 1 is brought down with a 16-bit shift
; (psrld $16) and multiplied via pmullw. Result is truncated to ax.
; SSE-LABEL: test_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a0)
  ret i16 %1
}
1101
define i16 @test_v4i16(<4 x i16> %a0) {
; Multiply-reduce four i16 lanes with a dword shuffle then a 16-bit shift,
; halving the active element count at each step.
; SSE-LABEL: test_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %a0)
  ret i16 %1
}
1126
define i16 @test_v8i16(<8 x i16> %a0) {
; Multiply-reduce eight i16 lanes: three shuffle/shift + pmullw steps
; (8 -> 4 -> 2 -> 1 active elements).
; SSE-LABEL: test_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %a0)
  ret i16 %1
}
1155
define i16 @test_v16i16(<16 x i16> %a0) {
; Multiply-reduce sixteen i16 lanes; the upper 128-bit half is folded in
; first, then the xmm register is reduced as in the v8i16 case.
; SSE-LABEL: test_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %a0)
  ret i16 %1
}
1218
define i16 @test_v32i16(<32 x i16> %a0) {
; Multiply-reduce thirty-two i16 lanes (one zmm input). The BW variants can
; use a 512-bit vpmullw; the DQ variants (no BW) split to 256-bit first.
; SSE-LABEL: test_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v32i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v32i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %a0)
  ret i16 %1
}
1340
define i16 @test_v64i16(<64 x i16> %a0) {
; Multiply-reduce sixty-four i16 lanes (two zmm inputs). The BW variants do a
; full-width 512-bit vpmullw first; the DQ variants (no BW support) reduce the
; zmm halves at 256-bit width before narrowing.
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm6, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm7, %xmm3
; SSE-NEXT:    pmullw %xmm5, %xmm1
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm5
; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v64i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v64i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v64i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %a0)
  ret i16 %1
}
1482
1483;
1484; vXi8
1485;
1486
define i8 @test_v2i8(<2 x i8> %a0) {
; Multiply-reduce two i8 lanes; lane 1 is shifted down by 8 bits and the
; product is formed with pmullw (only the low byte of the result matters).
; SSE-LABEL: test_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a0)
  ret i8 %1
}
1507
define i8 @test_v4i8(<4 x i8> %a0) {
; Multiply-reduce four i8 lanes. Bytes are first widened to i16 (there is no
; packed byte multiply on x86), then reduced with shuffle/shift + pmullw.
; SSE2-LABEL: test_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmullw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %a0)
  ret i8 %1
}
1546
; Multiply-reduce <8 x i8>. Same strategy as the v4i8 case with one extra
; reduction step: widen the 8 bytes to 8 words (punpcklbw / pmovzxbw),
; then shuffle the high half down and multiply three times (8 -> 4 -> 2 ->
; 1 word lanes), finally moving the low lane to eax and keeping only al.
; NOTE(review): assertions autogenerated by update_llc_test_checks.py;
; regenerate rather than hand-editing.
1547define i8 @test_v8i8(<8 x i8> %a0) {
1548; SSE2-LABEL: test_v8i8:
1549; SSE2:       # %bb.0:
1550; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1551; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1552; SSE2-NEXT:    pmullw %xmm0, %xmm1
1553; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1554; SSE2-NEXT:    pmullw %xmm1, %xmm0
1555; SSE2-NEXT:    movdqa %xmm0, %xmm1
1556; SSE2-NEXT:    psrld $16, %xmm1
1557; SSE2-NEXT:    pmullw %xmm0, %xmm1
1558; SSE2-NEXT:    movd %xmm1, %eax
1559; SSE2-NEXT:    # kill: def $al killed $al killed $eax
1560; SSE2-NEXT:    retq
1561;
1562; SSE41-LABEL: test_v8i8:
1563; SSE41:       # %bb.0:
1564; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1565; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1566; SSE41-NEXT:    pmullw %xmm0, %xmm1
1567; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1568; SSE41-NEXT:    pmullw %xmm1, %xmm0
1569; SSE41-NEXT:    movdqa %xmm0, %xmm1
1570; SSE41-NEXT:    psrld $16, %xmm1
1571; SSE41-NEXT:    pmullw %xmm0, %xmm1
1572; SSE41-NEXT:    movd %xmm1, %eax
1573; SSE41-NEXT:    # kill: def $al killed $al killed $eax
1574; SSE41-NEXT:    retq
1575;
1576; AVX-LABEL: test_v8i8:
1577; AVX:       # %bb.0:
1578; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1579; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1580; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1581; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1582; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1583; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
1584; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1585; AVX-NEXT:    vmovd %xmm0, %eax
1586; AVX-NEXT:    # kill: def $al killed $al killed $eax
1587; AVX-NEXT:    retq
; The IR under test: a plain mul reduction intrinsic over 8 bytes.
1588  %1 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %a0)
1589  ret i8 %1
1590}
1591
; Multiply-reduce <16 x i8>. The 16 bytes no longer fit in one register as
; words, so the expected lowering unpacks low and high byte halves into two
; word vectors (punpckhbw + punpcklbw / pmovzxbw), multiplies them together
; to fold 16 -> 8 lanes, then runs the usual shuffle-and-pmullw reduction
; down to one word; only al of the movd result is kept.
; NOTE(review): assertions autogenerated by update_llc_test_checks.py;
; regenerate rather than hand-editing.
1592define i8 @test_v16i8(<16 x i8> %a0) {
1593; SSE2-LABEL: test_v16i8:
1594; SSE2:       # %bb.0:
1595; SSE2-NEXT:    movdqa %xmm0, %xmm1
1596; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1597; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1598; SSE2-NEXT:    pmullw %xmm1, %xmm0
1599; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1600; SSE2-NEXT:    pmullw %xmm0, %xmm1
1601; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1602; SSE2-NEXT:    pmullw %xmm1, %xmm0
1603; SSE2-NEXT:    movdqa %xmm0, %xmm1
1604; SSE2-NEXT:    psrld $16, %xmm1
1605; SSE2-NEXT:    pmullw %xmm0, %xmm1
1606; SSE2-NEXT:    movd %xmm1, %eax
1607; SSE2-NEXT:    # kill: def $al killed $al killed $eax
1608; SSE2-NEXT:    retq
1609;
1610; SSE41-LABEL: test_v16i8:
1611; SSE41:       # %bb.0:
1612; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1613; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1614; SSE41-NEXT:    pmullw %xmm1, %xmm0
1615; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1616; SSE41-NEXT:    pmullw %xmm0, %xmm1
1617; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1618; SSE41-NEXT:    pmullw %xmm1, %xmm0
1619; SSE41-NEXT:    movdqa %xmm0, %xmm1
1620; SSE41-NEXT:    psrld $16, %xmm1
1621; SSE41-NEXT:    pmullw %xmm0, %xmm1
1622; SSE41-NEXT:    movd %xmm1, %eax
1623; SSE41-NEXT:    # kill: def $al killed $al killed $eax
1624; SSE41-NEXT:    retq
1625;
1626; AVX-LABEL: test_v16i8:
1627; AVX:       # %bb.0:
1628; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1629; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1630; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1631; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1632; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1633; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1634; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1635; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
1636; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1637; AVX-NEXT:    vmovd %xmm0, %eax
1638; AVX-NEXT:    # kill: def $al killed $al killed $eax
1639; AVX-NEXT:    retq
; The IR under test: a plain mul reduction intrinsic over 16 bytes.
1640  %1 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %a0)
1641  ret i8 %1
1642}
1643
; Multiply-reduce <32 x i8>. First diverging point between prefixes:
;  - SSE2/SSE4.1 receive the vector as two xmm registers; each is widened
;    to words (high/low unpack) and multiplied, the two partial products
;    are combined with pmullw, then the standard shuffle reduction runs.
;  - AVX1 extracts the upper 128-bit lane first and reduces in xmm.
;  - AVX2/AVX512 unpack bytes to words within the full ymm register,
;    multiply in ymm, then extract the upper xmm half and finish in xmm;
;    vzeroupper precedes the return since ymm state was used.
; As everywhere in this file only the low byte (al) of the result is live.
; NOTE(review): assertions autogenerated by update_llc_test_checks.py;
; regenerate rather than hand-editing.
1644define i8 @test_v32i8(<32 x i8> %a0) {
1645; SSE2-LABEL: test_v32i8:
1646; SSE2:       # %bb.0:
1647; SSE2-NEXT:    movdqa %xmm1, %xmm2
1648; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1649; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1650; SSE2-NEXT:    pmullw %xmm2, %xmm1
1651; SSE2-NEXT:    movdqa %xmm0, %xmm2
1652; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1653; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1654; SSE2-NEXT:    pmullw %xmm2, %xmm0
1655; SSE2-NEXT:    pmullw %xmm1, %xmm0
1656; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1657; SSE2-NEXT:    pmullw %xmm0, %xmm1
1658; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1659; SSE2-NEXT:    pmullw %xmm1, %xmm0
1660; SSE2-NEXT:    movdqa %xmm0, %xmm1
1661; SSE2-NEXT:    psrld $16, %xmm1
1662; SSE2-NEXT:    pmullw %xmm0, %xmm1
1663; SSE2-NEXT:    movd %xmm1, %eax
1664; SSE2-NEXT:    # kill: def $al killed $al killed $eax
1665; SSE2-NEXT:    retq
1666;
1667; SSE41-LABEL: test_v32i8:
1668; SSE41:       # %bb.0:
1669; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1670; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1671; SSE41-NEXT:    pmullw %xmm2, %xmm1
1672; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1673; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1674; SSE41-NEXT:    pmullw %xmm2, %xmm0
1675; SSE41-NEXT:    pmullw %xmm1, %xmm0
1676; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1677; SSE41-NEXT:    pmullw %xmm0, %xmm1
1678; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1679; SSE41-NEXT:    pmullw %xmm1, %xmm0
1680; SSE41-NEXT:    movdqa %xmm0, %xmm1
1681; SSE41-NEXT:    psrld $16, %xmm1
1682; SSE41-NEXT:    pmullw %xmm0, %xmm1
1683; SSE41-NEXT:    movd %xmm1, %eax
1684; SSE41-NEXT:    # kill: def $al killed $al killed $eax
1685; SSE41-NEXT:    retq
1686;
1687; AVX1-LABEL: test_v32i8:
1688; AVX1:       # %bb.0:
1689; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1690; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1691; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1692; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
1693; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1694; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1695; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1696; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1697; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1698; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1699; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1700; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1701; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
1702; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1703; AVX1-NEXT:    vmovd %xmm0, %eax
1704; AVX1-NEXT:    # kill: def $al killed $al killed $eax
1705; AVX1-NEXT:    vzeroupper
1706; AVX1-NEXT:    retq
1707;
1708; AVX2-LABEL: test_v32i8:
1709; AVX2:       # %bb.0:
1710; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1711; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1712; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1713; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1714; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1715; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1716; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1717; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1718; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1719; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
1720; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1721; AVX2-NEXT:    vmovd %xmm0, %eax
1722; AVX2-NEXT:    # kill: def $al killed $al killed $eax
1723; AVX2-NEXT:    vzeroupper
1724; AVX2-NEXT:    retq
1725;
1726; AVX512-LABEL: test_v32i8:
1727; AVX512:       # %bb.0:
1728; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1729; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1730; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1731; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1732; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1733; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1734; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1735; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1736; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1737; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
1738; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1739; AVX512-NEXT:    vmovd %xmm0, %eax
1740; AVX512-NEXT:    # kill: def $al killed $al killed $eax
1741; AVX512-NEXT:    vzeroupper
1742; AVX512-NEXT:    retq
; The IR under test: a plain mul reduction intrinsic over 32 bytes.
1743  %1 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %a0)
1744  ret i8 %1
1745}
1746
; Multiply-reduce <64 x i8>. The 512-bit input arrives as four xmm
; registers (SSE), two ymm registers (AVX1/AVX2/AVX512DQ*), or one zmm
; register (AVX512BW*). Each prefix widens byte halves to words, folds the
; partial products together with pmullw at its native width, narrows via
; lane extraction (zmm -> ymm -> xmm where applicable), and finishes with
; the common xmm shuffle-and-multiply reduction into al. This is also the
; first width where BW-capable and DQ-only AVX-512 configurations emit
; different sequences (zmm-wide multiply vs. per-ymm multiplies).
; NOTE(review): assertions autogenerated by update_llc_test_checks.py;
; regenerate rather than hand-editing.
1747define i8 @test_v64i8(<64 x i8> %a0) {
1748; SSE2-LABEL: test_v64i8:
1749; SSE2:       # %bb.0:
1750; SSE2-NEXT:    movdqa %xmm3, %xmm4
1751; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1752; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1753; SSE2-NEXT:    pmullw %xmm4, %xmm3
1754; SSE2-NEXT:    movdqa %xmm1, %xmm4
1755; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1756; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1757; SSE2-NEXT:    pmullw %xmm4, %xmm1
1758; SSE2-NEXT:    pmullw %xmm3, %xmm1
1759; SSE2-NEXT:    movdqa %xmm2, %xmm3
1760; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1761; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1762; SSE2-NEXT:    pmullw %xmm3, %xmm2
1763; SSE2-NEXT:    movdqa %xmm0, %xmm3
1764; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1765; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1766; SSE2-NEXT:    pmullw %xmm3, %xmm0
1767; SSE2-NEXT:    pmullw %xmm2, %xmm0
1768; SSE2-NEXT:    pmullw %xmm1, %xmm0
1769; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1770; SSE2-NEXT:    pmullw %xmm0, %xmm1
1771; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1772; SSE2-NEXT:    pmullw %xmm1, %xmm0
1773; SSE2-NEXT:    movdqa %xmm0, %xmm1
1774; SSE2-NEXT:    psrld $16, %xmm1
1775; SSE2-NEXT:    pmullw %xmm0, %xmm1
1776; SSE2-NEXT:    movd %xmm1, %eax
1777; SSE2-NEXT:    # kill: def $al killed $al killed $eax
1778; SSE2-NEXT:    retq
1779;
1780; SSE41-LABEL: test_v64i8:
1781; SSE41:       # %bb.0:
1782; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1783; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1784; SSE41-NEXT:    pmullw %xmm4, %xmm3
1785; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1786; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1787; SSE41-NEXT:    pmullw %xmm4, %xmm1
1788; SSE41-NEXT:    pmullw %xmm3, %xmm1
1789; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1790; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1791; SSE41-NEXT:    pmullw %xmm3, %xmm2
1792; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1793; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1794; SSE41-NEXT:    pmullw %xmm3, %xmm0
1795; SSE41-NEXT:    pmullw %xmm2, %xmm0
1796; SSE41-NEXT:    pmullw %xmm1, %xmm0
1797; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1798; SSE41-NEXT:    pmullw %xmm0, %xmm1
1799; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1800; SSE41-NEXT:    pmullw %xmm1, %xmm0
1801; SSE41-NEXT:    movdqa %xmm0, %xmm1
1802; SSE41-NEXT:    psrld $16, %xmm1
1803; SSE41-NEXT:    pmullw %xmm0, %xmm1
1804; SSE41-NEXT:    movd %xmm1, %eax
1805; SSE41-NEXT:    # kill: def $al killed $al killed $eax
1806; SSE41-NEXT:    retq
1807;
1808; AVX1-LABEL: test_v64i8:
1809; AVX1:       # %bb.0:
1810; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1811; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1812; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1813; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
1814; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1815; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1816; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1817; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
1818; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
1819; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1820; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1821; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
1822; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1823; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1824; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
1825; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1826; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1827; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1828; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1829; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1830; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1831; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
1832; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1833; AVX1-NEXT:    vmovd %xmm0, %eax
1834; AVX1-NEXT:    # kill: def $al killed $al killed $eax
1835; AVX1-NEXT:    vzeroupper
1836; AVX1-NEXT:    retq
1837;
1838; AVX2-LABEL: test_v64i8:
1839; AVX2:       # %bb.0:
1840; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1841; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1842; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
1843; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1844; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1845; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
1846; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1847; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1848; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1849; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1850; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1851; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1852; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1853; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
1854; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1855; AVX2-NEXT:    vmovd %xmm0, %eax
1856; AVX2-NEXT:    # kill: def $al killed $al killed $eax
1857; AVX2-NEXT:    vzeroupper
1858; AVX2-NEXT:    retq
1859;
1860; AVX512BW-LABEL: test_v64i8:
1861; AVX512BW:       # %bb.0:
1862; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1863; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1864; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
1865; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1866; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1867; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1868; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1869; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1870; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1871; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1872; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1873; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
1874; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1875; AVX512BW-NEXT:    vmovd %xmm0, %eax
1876; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
1877; AVX512BW-NEXT:    vzeroupper
1878; AVX512BW-NEXT:    retq
1879;
1880; AVX512BWVL-LABEL: test_v64i8:
1881; AVX512BWVL:       # %bb.0:
1882; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1883; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1884; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
1885; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1886; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1887; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
1888; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1889; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1890; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1891; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1892; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1893; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
1894; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1895; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
1896; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
1897; AVX512BWVL-NEXT:    vzeroupper
1898; AVX512BWVL-NEXT:    retq
1899;
1900; AVX512DQ-LABEL: test_v64i8:
1901; AVX512DQ:       # %bb.0:
1902; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1903; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1904; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1905; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
1906; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1907; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1908; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
1909; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1910; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
1911; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1912; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1913; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1914; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1915; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1916; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
1917; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1918; AVX512DQ-NEXT:    vmovd %xmm0, %eax
1919; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
1920; AVX512DQ-NEXT:    vzeroupper
1921; AVX512DQ-NEXT:    retq
1922;
1923; AVX512DQVL-LABEL: test_v64i8:
1924; AVX512DQVL:       # %bb.0:
1925; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1926; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1927; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1928; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
1929; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1930; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1931; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
1932; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1933; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
1934; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1935; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1936; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1937; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1938; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1939; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
1940; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1941; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
1942; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
1943; AVX512DQVL-NEXT:    vzeroupper
1944; AVX512DQVL-NEXT:    retq
; The IR under test: a plain mul reduction intrinsic over 64 bytes.
1945  %1 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %a0)
1946  ret i8 %1
1947}
1948
1949define i8 @test_v128i8(<128 x i8> %a0) {
1950; SSE2-LABEL: test_v128i8:
1951; SSE2:       # %bb.0:
1952; SSE2-NEXT:    movdqa %xmm7, %xmm8
1953; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1954; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1955; SSE2-NEXT:    pmullw %xmm8, %xmm7
1956; SSE2-NEXT:    movdqa %xmm3, %xmm8
1957; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1958; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1959; SSE2-NEXT:    pmullw %xmm8, %xmm3
1960; SSE2-NEXT:    pmullw %xmm7, %xmm3
1961; SSE2-NEXT:    movdqa %xmm5, %xmm7
1962; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1963; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1964; SSE2-NEXT:    pmullw %xmm7, %xmm5
1965; SSE2-NEXT:    movdqa %xmm1, %xmm7
1966; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1967; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1968; SSE2-NEXT:    pmullw %xmm7, %xmm1
1969; SSE2-NEXT:    pmullw %xmm5, %xmm1
1970; SSE2-NEXT:    pmullw %xmm3, %xmm1
1971; SSE2-NEXT:    movdqa %xmm6, %xmm3
1972; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1973; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1974; SSE2-NEXT:    pmullw %xmm3, %xmm6
1975; SSE2-NEXT:    movdqa %xmm2, %xmm3
1976; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1977; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1978; SSE2-NEXT:    pmullw %xmm3, %xmm2
1979; SSE2-NEXT:    pmullw %xmm6, %xmm2
1980; SSE2-NEXT:    movdqa %xmm4, %xmm3
1981; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1982; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1983; SSE2-NEXT:    pmullw %xmm3, %xmm4
1984; SSE2-NEXT:    movdqa %xmm0, %xmm3
1985; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1986; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1987; SSE2-NEXT:    pmullw %xmm3, %xmm0
1988; SSE2-NEXT:    pmullw %xmm4, %xmm0
1989; SSE2-NEXT:    pmullw %xmm2, %xmm0
1990; SSE2-NEXT:    pmullw %xmm1, %xmm0
1991; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1992; SSE2-NEXT:    pmullw %xmm0, %xmm1
1993; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1994; SSE2-NEXT:    pmullw %xmm1, %xmm0
1995; SSE2-NEXT:    movdqa %xmm0, %xmm1
1996; SSE2-NEXT:    psrld $16, %xmm1
1997; SSE2-NEXT:    pmullw %xmm0, %xmm1
1998; SSE2-NEXT:    movd %xmm1, %eax
1999; SSE2-NEXT:    # kill: def $al killed $al killed $eax
2000; SSE2-NEXT:    retq
2001;
2002; SSE41-LABEL: test_v128i8:
2003; SSE41:       # %bb.0:
2004; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
2005; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2006; SSE41-NEXT:    pmullw %xmm8, %xmm7
2007; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2008; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2009; SSE41-NEXT:    pmullw %xmm8, %xmm3
2010; SSE41-NEXT:    pmullw %xmm7, %xmm3
2011; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2012; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2013; SSE41-NEXT:    pmullw %xmm7, %xmm5
2014; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2015; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2016; SSE41-NEXT:    pmullw %xmm7, %xmm1
2017; SSE41-NEXT:    pmullw %xmm5, %xmm1
2018; SSE41-NEXT:    pmullw %xmm3, %xmm1
2019; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
2020; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2021; SSE41-NEXT:    pmullw %xmm3, %xmm6
2022; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2023; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2024; SSE41-NEXT:    pmullw %xmm3, %xmm2
2025; SSE41-NEXT:    pmullw %xmm6, %xmm2
2026; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
2027; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2028; SSE41-NEXT:    pmullw %xmm3, %xmm4
2029; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2030; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2031; SSE41-NEXT:    pmullw %xmm3, %xmm0
2032; SSE41-NEXT:    pmullw %xmm4, %xmm0
2033; SSE41-NEXT:    pmullw %xmm2, %xmm0
2034; SSE41-NEXT:    pmullw %xmm1, %xmm0
2035; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2036; SSE41-NEXT:    pmullw %xmm0, %xmm1
2037; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2038; SSE41-NEXT:    pmullw %xmm1, %xmm0
2039; SSE41-NEXT:    movdqa %xmm0, %xmm1
2040; SSE41-NEXT:    psrld $16, %xmm1
2041; SSE41-NEXT:    pmullw %xmm0, %xmm1
2042; SSE41-NEXT:    movd %xmm1, %eax
2043; SSE41-NEXT:    # kill: def $al killed $al killed $eax
2044; SSE41-NEXT:    retq
2045;
2046; AVX1-LABEL: test_v128i8:
2047; AVX1:       # %bb.0:
2048; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
2049; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2050; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
2051; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
2052; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
2053; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2054; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2055; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
2056; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
2057; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
2058; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2059; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2060; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
2061; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
2062; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2063; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
2064; AVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
2065; AVX1-NEXT:    vpmullw %xmm5, %xmm6, %xmm5
2066; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
2067; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2068; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2069; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
2070; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2071; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2072; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
2073; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
2074; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2075; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2076; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
2077; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2078; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2079; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
2080; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
2081; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2082; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
2083; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2084; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2085; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2086; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2087; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
2088; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2089; AVX1-NEXT:    vmovd %xmm0, %eax
2090; AVX1-NEXT:    # kill: def $al killed $al killed $eax
2091; AVX1-NEXT:    vzeroupper
2092; AVX1-NEXT:    retq
2093;
2094; AVX2-LABEL: test_v128i8:
2095; AVX2:       # %bb.0:
2096; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2097; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2098; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
2099; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2100; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2101; AVX2-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
2102; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
2103; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2104; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2105; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
2106; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2107; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2108; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
2109; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
2110; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2111; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2112; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2113; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2114; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2115; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2116; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2117; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
2118; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2119; AVX2-NEXT:    vmovd %xmm0, %eax
2120; AVX2-NEXT:    # kill: def $al killed $al killed $eax
2121; AVX2-NEXT:    vzeroupper
2122; AVX2-NEXT:    retq
2123;
2124; AVX512BW-LABEL: test_v128i8:
2125; AVX512BW:       # %bb.0:
2126; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2127; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2128; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
2129; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2130; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2131; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
2132; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
2133; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2134; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2135; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
2136; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2137; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2138; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2139; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2140; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2141; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
2142; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2143; AVX512BW-NEXT:    vmovd %xmm0, %eax
2144; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
2145; AVX512BW-NEXT:    vzeroupper
2146; AVX512BW-NEXT:    retq
2147;
2148; AVX512BWVL-LABEL: test_v128i8:
2149; AVX512BWVL:       # %bb.0:
2150; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2151; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2152; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
2153; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2154; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2155; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
2156; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
2157; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2158; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2159; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
2160; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2161; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2162; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2163; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2164; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2165; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
2166; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2167; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
2168; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
2169; AVX512BWVL-NEXT:    vzeroupper
2170; AVX512BWVL-NEXT:    retq
2171;
2172; AVX512DQ-LABEL: test_v128i8:
2173; AVX512DQ:       # %bb.0:
2174; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2175; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2176; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2177; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
2178; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2179; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2180; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2181; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
2182; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
2183; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2184; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2185; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
2186; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2187; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2188; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
2189; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2190; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
2191; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
2192; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2193; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2194; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2195; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2196; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2197; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
2198; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2199; AVX512DQ-NEXT:    vmovd %xmm0, %eax
2200; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
2201; AVX512DQ-NEXT:    vzeroupper
2202; AVX512DQ-NEXT:    retq
2203;
2204; AVX512DQVL-LABEL: test_v128i8:
2205; AVX512DQVL:       # %bb.0:
2206; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2207; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2208; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2209; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
2210; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2211; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2212; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2213; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
2214; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
2215; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2216; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2217; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
2218; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2219; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2220; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
2221; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2222; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
2223; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
2224; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2225; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2226; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2227; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2228; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2229; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
2230; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2231; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
2232; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
2233; AVX512DQVL-NEXT:    vzeroupper
2234; AVX512DQVL-NEXT:    retq
2235  %1 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %a0)
2236  ret i8 %1
2237}
2238
2239;
2240; Legalization
2241;
2242
; Legalization test: reduce-mul over an illegal <4 x i8> vector loaded from
; memory, with the result multiplied into a scalar i8 argument. Exercises how
; the type legalizer widens v4i8 (zero/any-extend to words, log2 shuffle+pmullw
; reduction, then a scalar mulb against %a0).
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py (see file header) -- regenerate rather than
; hand-editing them.
2243define i8 @illegal_v4i8(i8 %a0, ptr %a1) {
2244; SSE2-LABEL: illegal_v4i8:
2245; SSE2:       # %bb.0:
2246; SSE2-NEXT:    movl %edi, %eax
2247; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2248; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2249; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2250; SSE2-NEXT:    pmullw %xmm0, %xmm1
2251; SSE2-NEXT:    movdqa %xmm1, %xmm0
2252; SSE2-NEXT:    psrld $16, %xmm0
2253; SSE2-NEXT:    pmullw %xmm1, %xmm0
2254; SSE2-NEXT:    movd %xmm0, %ecx
2255; SSE2-NEXT:    # kill: def $al killed $al killed $eax
2256; SSE2-NEXT:    mulb %cl
2257; SSE2-NEXT:    retq
2258;
2259; SSE41-LABEL: illegal_v4i8:
2260; SSE41:       # %bb.0:
2261; SSE41-NEXT:    movl %edi, %eax
2262; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2263; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2264; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2265; SSE41-NEXT:    pmullw %xmm0, %xmm1
2266; SSE41-NEXT:    movdqa %xmm1, %xmm0
2267; SSE41-NEXT:    psrld $16, %xmm0
2268; SSE41-NEXT:    pmullw %xmm1, %xmm0
2269; SSE41-NEXT:    movd %xmm0, %ecx
2270; SSE41-NEXT:    # kill: def $al killed $al killed $eax
2271; SSE41-NEXT:    mulb %cl
2272; SSE41-NEXT:    retq
2273;
2274; AVX-LABEL: illegal_v4i8:
2275; AVX:       # %bb.0:
2276; AVX-NEXT:    movl %edi, %eax
2277; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2278; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2279; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2280; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2281; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
2282; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2283; AVX-NEXT:    vmovd %xmm0, %ecx
2284; AVX-NEXT:    # kill: def $al killed $al killed $eax
2285; AVX-NEXT:    mulb %cl
2286; AVX-NEXT:    retq
2287  %ld = load <4 x i8>, ptr %a1, align 4          ; illegal-width vector load (widened by legalizer)
2288  %rdx = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %ld)
2289  %mul = mul i8 %a0, %rdx                        ; fold reduction result into scalar arg
2290  ret i8 %mul
2291}
2292
; Legalization test: reduce-mul over an illegal <8 x i8> vector loaded from
; memory, multiplied into a scalar i8 argument. Same widening strategy as
; illegal_v4i8 but with one extra shuffle+pmullw reduction step for the wider
; vector (pshufd [2,3,2,3], then [1,1,1,1], then psrld $16).
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-editing them.
2293define i8 @illegal_v8i8(i8 %a0, ptr %a1) {
2294; SSE2-LABEL: illegal_v8i8:
2295; SSE2:       # %bb.0:
2296; SSE2-NEXT:    movl %edi, %eax
2297; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2298; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2299; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2300; SSE2-NEXT:    pmullw %xmm0, %xmm1
2301; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2302; SSE2-NEXT:    pmullw %xmm1, %xmm0
2303; SSE2-NEXT:    movdqa %xmm0, %xmm1
2304; SSE2-NEXT:    psrld $16, %xmm1
2305; SSE2-NEXT:    pmullw %xmm0, %xmm1
2306; SSE2-NEXT:    movd %xmm1, %ecx
2307; SSE2-NEXT:    # kill: def $al killed $al killed $eax
2308; SSE2-NEXT:    mulb %cl
2309; SSE2-NEXT:    retq
2310;
2311; SSE41-LABEL: illegal_v8i8:
2312; SSE41:       # %bb.0:
2313; SSE41-NEXT:    movl %edi, %eax
2314; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2315; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2316; SSE41-NEXT:    pmullw %xmm0, %xmm1
2317; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2318; SSE41-NEXT:    pmullw %xmm1, %xmm0
2319; SSE41-NEXT:    movdqa %xmm0, %xmm1
2320; SSE41-NEXT:    psrld $16, %xmm1
2321; SSE41-NEXT:    pmullw %xmm0, %xmm1
2322; SSE41-NEXT:    movd %xmm1, %ecx
2323; SSE41-NEXT:    # kill: def $al killed $al killed $eax
2324; SSE41-NEXT:    mulb %cl
2325; SSE41-NEXT:    retq
2326;
2327; AVX-LABEL: illegal_v8i8:
2328; AVX:       # %bb.0:
2329; AVX-NEXT:    movl %edi, %eax
2330; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2331; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2332; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2333; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2334; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2335; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
2336; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2337; AVX-NEXT:    vmovd %xmm0, %ecx
2338; AVX-NEXT:    # kill: def $al killed $al killed $eax
2339; AVX-NEXT:    mulb %cl
2340; AVX-NEXT:    retq
2341  %ld = load <8 x i8>, ptr %a1, align 4          ; illegal-width vector load (widened by legalizer)
2342  %rdx = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %ld)
2343  %mul = mul i8 %a0, %rdx                        ; fold reduction result into scalar arg
2344  ret i8 %mul
2345}
2346
; Regression test for PR51858: an i128 argument (passed in rdi:rsi per the
; SysV ABI) is bitcast to <16 x i8> and fed to reduce-mul. Checks that the
; two 64-bit halves are moved into vector registers, widened to words, and
; reduced with the standard shuffle+pmullw log2 sequence without scalarizing.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-editing them.
2347define i8 @PR51858(i128 %arg) {
2348; SSE2-LABEL: PR51858:
2349; SSE2:       # %bb.0:
2350; SSE2-NEXT:    movq %rdi, %xmm0
2351; SSE2-NEXT:    movq %rsi, %xmm1
2352; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2353; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2354; SSE2-NEXT:    pmullw %xmm1, %xmm0
2355; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2356; SSE2-NEXT:    pmullw %xmm0, %xmm1
2357; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2358; SSE2-NEXT:    pmullw %xmm1, %xmm0
2359; SSE2-NEXT:    movdqa %xmm0, %xmm1
2360; SSE2-NEXT:    psrld $16, %xmm1
2361; SSE2-NEXT:    pmullw %xmm0, %xmm1
2362; SSE2-NEXT:    movd %xmm1, %eax
2363; SSE2-NEXT:    # kill: def $al killed $al killed $eax
2364; SSE2-NEXT:    retq
2365;
2366; SSE41-LABEL: PR51858:
2367; SSE41:       # %bb.0:
2368; SSE41-NEXT:    movq %rdi, %xmm0
2369; SSE41-NEXT:    movq %rsi, %xmm1
2370; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2371; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2372; SSE41-NEXT:    pmullw %xmm1, %xmm0
2373; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2374; SSE41-NEXT:    pmullw %xmm0, %xmm1
2375; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2376; SSE41-NEXT:    pmullw %xmm1, %xmm0
2377; SSE41-NEXT:    movdqa %xmm0, %xmm1
2378; SSE41-NEXT:    psrld $16, %xmm1
2379; SSE41-NEXT:    pmullw %xmm0, %xmm1
2380; SSE41-NEXT:    movd %xmm1, %eax
2381; SSE41-NEXT:    # kill: def $al killed $al killed $eax
2382; SSE41-NEXT:    retq
2383;
2384; AVX-LABEL: PR51858:
2385; AVX:       # %bb.0:
2386; AVX-NEXT:    vmovq %rdi, %xmm0
2387; AVX-NEXT:    vmovq %rsi, %xmm1
2388; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2389; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2390; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2391; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2392; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2393; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2394; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2395; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
2396; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2397; AVX-NEXT:    vmovd %xmm0, %eax
2398; AVX-NEXT:    # kill: def $al killed $al killed $eax
2399; AVX-NEXT:    retq
2400  %vec = bitcast i128 %arg to <16 x i8>          ; reinterpret the scalar argument as a byte vector
2401  %red = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %vec)
2402  ret i8 %red
2403}
2404
2405declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
2406declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
2407declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
2408declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
2409
2410declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
2411declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
2412declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
2413declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
2414declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)
2415
2416declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
2417declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
2418declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
2419declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
2420declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)
2421declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)
2422
2423declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
2424declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
2425declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
2426declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
2427declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)
2428declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)
2429declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)
2430