; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
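
; Each test sign-extends a narrow integer vector and feeds it to
; llvm.vector.reduce.add, checking the lowering at every target level. Note
; that SSE2 has no pmovsx* instructions, so it synthesizes the sign extension
; with punpck* interleaves followed by arithmetic right shifts (psrad/psraw)
; or pcmpgtd sign masks, while SSE4.1 and AVX use pmovsx* directly.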

;
; vXi64
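; The reductions proceed by halving: shuffle/extract the high half, paddq,
; and repeat until the scalar sits in the low lane for the final movq.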
;

define i64 @test_v2i64_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i64_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = sext <2 x i32> %a0 to <2 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <4 x i16> %a0 to <4 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1)
  ret i64 %2
}

define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i64_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psrad $24, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; SSE2-NEXT:    paddq %xmm4, %xmm5
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm5, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
; SSE41-NEXT:    paddq %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq $48, %xmm1
; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm3
; AVX1-NEXT:    vpmovsxbq %xmm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <8 x i8> %a0 to <8 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1)
  ret i64 %2
}

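; The <16 x i8> -> <16 x i64> case needs the full 8x widening; without pmovsx
; (SSE2) this costs a long chain of unpack/pcmpgtd pairs to build the sign bits.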
define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i64_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT:    psrad $24, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    pxor %xmm7, %xmm7
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; SSE2-NEXT:    paddq %xmm5, %xmm8
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $24, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm4, %xmm9
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $24, %xmm6
; SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm6, %xmm10
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
; SSE2-NEXT:    paddq %xmm9, %xmm10
; SSE2-NEXT:    paddq %xmm8, %xmm10
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE2-NEXT:    paddq %xmm4, %xmm6
; SSE2-NEXT:    paddq %xmm0, %xmm6
; SSE2-NEXT:    paddq %xmm10, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT:    paddq %xmm6, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i64_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    pmovsxbq %xmm2, %xmm0
; SSE41-NEXT:    psrlq $48, %xmm3
; SSE41-NEXT:    pmovsxbq %xmm3, %xmm2
; SSE41-NEXT:    paddq %xmm0, %xmm2
; SSE41-NEXT:    paddq %xmm1, %xmm2
; SSE41-NEXT:    pmovsxbq %xmm5, %xmm0
; SSE41-NEXT:    paddq %xmm4, %xmm0
; SSE41-NEXT:    pmovsxbq %xmm6, %xmm1
; SSE41-NEXT:    pmovsxbq %xmm7, %xmm3
; SSE41-NEXT:    paddq %xmm1, %xmm3
; SSE41-NEXT:    paddq %xmm0, %xmm3
; SSE41-NEXT:    paddq %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT:    paddq %xmm3, %xmm0
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3]
; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3]
; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
; AVX1-NEXT:    vpaddq %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <16 x i8> %a0 to <16 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1)
  ret i64 %2
}

;
; vXi32
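; Same halving pattern with paddd down to a movd; with +fast-hops the AVX1
; lowering folds the final steps into vphaddd.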
;

define i32 @test_v2i32_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i32_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i32_v2i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = sext <2 x i16> %a0 to <2 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1)
  ret i32 %2
}

define i32 @test_v4i32(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = sext <4 x i8> %a0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @test_v8i32_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i32_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psrad $24, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i32_v8i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i32_v8i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i32_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <8 x i8> %a0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i32_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrad $24, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT:    psrad $24, %xmm3
; SSE2-NEXT:    paddd %xmm2, %xmm3
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $24, %xmm1
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    paddd %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i32_v16i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i32_v16i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <16 x i8> %a0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

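; The <32 x i8> case splits the ymm source; AVX512 covers it with two zmm
; pmovsxbd extensions and a single zmm add before the scalarization steps.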
define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i32_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT:    psrad $24, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSE2-NEXT:    psrad $24, %xmm5
; SSE2-NEXT:    paddd %xmm3, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    psrad $24, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT:    psrad $24, %xmm6
; SSE2-NEXT:    paddd %xmm3, %xmm6
; SSE2-NEXT:    paddd %xmm5, %xmm6
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT:    psrad $24, %xmm3
; SSE2-NEXT:    paddd %xmm2, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    paddd %xmm3, %xmm0
; SSE2-NEXT:    paddd %xmm6, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i32_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm3
; SSE41-NEXT:    paddd %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; SSE41-NEXT:    pmovsxbd %xmm4, %xmm4
; SSE41-NEXT:    paddd %xmm2, %xmm4
; SSE41-NEXT:    paddd %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm3
; SSE41-NEXT:    paddd %xmm2, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    paddd %xmm3, %xmm0
; SSE41-NEXT:    paddd %xmm4, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i32_v32i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i32_v32i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
; AVX1-FAST-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmovsxbd %xmm1, %zmm1
; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <32 x i8> %a0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

;
; vXi16
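; The last reduction step shifts the upper word down with psrld $16 (or uses
; vphaddw with +fast-hops); the i16 result is returned in ax, hence the
; "kill" annotations on eax.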
;

define i16 @test_v2i16_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i16_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i16_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16_v2i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16_v2i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = sext <2 x i8> %a0 to <2 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1)
  ret i16 %2
}

define i16 @test_v4i16_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16_v4i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16_v4i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i16_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = sext <4 x i8> %a0 to <4 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
  ret i16 %2
}

define i16 @test_v8i16_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i16_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i16_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i16_v8i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i16_v8i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i16_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i16_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = sext <8 x i8> %a0 to <8 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
  ret i16 %2
}

define i16 @test_v16i16_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i16_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i16_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i16_v16i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i16_v16i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <16 x i8> %a0 to <16 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  ret i16 %2
}

define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i16_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    paddw %xmm2, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    paddw %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    paddw %xmm2, %xmm3
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    paddw %xmm3, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i16_v32i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i16_v32i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <32 x i8> %a0 to <32 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
  ret i16 %2
}

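; The <64 x i8> input arrives in four xmm registers on SSE targets, so the
; lowering pairs them up before the usual scalarization steps.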
define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i16_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    paddw %xmm4, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    paddw %xmm4, %xmm6
; SSE2-NEXT:    paddw %xmm5, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm6, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i16_v64i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm4
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
; SSE41-NEXT:    paddw %xmm4, %xmm5
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm4
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm6
; SSE41-NEXT:    paddw %xmm4, %xmm6
; SSE41-NEXT:    paddw %xmm5, %xmm6
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    paddw %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm6, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v64i16_v64i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm3, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm5, %xmm6
; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm6, %xmm4
; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
; AVX1-SLOW-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v64i16_v64i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-FAST-NEXT:    vpmovsxbw %xmm3, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-FAST-NEXT:    vpmovsxbw %xmm5, %xmm6
; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm6, %xmm4
; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
; AVX1-FAST-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sext <64 x i8> %a0 to <64 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2
}

;
; vXi1 - sum of extended bool vectors
;

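; In each test below the reduction input is the sign extension of an icmp
; mask, so every lane is 0 or -1 and the reduced sum is minus the number of
; lanes that pass the compare. A minimal equivalent via popcount, with
; hypothetical value names (a sketch, not how these tests are written):
;   %m = icmp slt <4 x i32> %x, zeroinitializer
;   %b = bitcast <4 x i1> %m to i4
;   %c = call i4 @llvm.ctpop.i4(i4 %b)
;   %n = zext i4 %c to i32
;   %r = sub i32 0, %n    ; same value as reduce.add of the sext'd mask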
define i64 @test_v2i64_v2i1(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64_v2i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64_v2i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i64_v2i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = icmp slt <2 x i64> %a0, zeroinitializer
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %2)
  ret i64 %3
}

define i32 @test_v4i32_v4i1(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32_v4i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32_v4i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i32_v4i1:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i32_v4i1:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i32_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = icmp slt <4 x i32> %a0, zeroinitializer
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  ret i32 %3
}

define i16 @test_v8i16_v8i1(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16_v8i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i16_v8i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pcmpgtw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i16_v8i1:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i16_v8i1:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i16_v8i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i16_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = icmp slt <8 x i16> %a0, zeroinitializer
  %2 = sext <8 x i1> %1 to <8 x i16>
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  ret i16 %3
}

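; The i8 reductions below finish with [v]psadbw against an all-zeros vector:
; the sum of absolute differences of x and 0 is the plain unsigned sum of the
; eight bytes in each 64-bit half, and its low 8 bits agree with the signed
; byte sum modulo 256. In pseudo-notation (an identity, not a line from the
; checks):
;   psadbw(x, 0).qword[i] == x.byte[8*i] + x.byte[8*i+1] + ... + x.byte[8*i+7]
; which is why only one pshufd+paddb halving step is needed before a single
; psadbw produces the final i8 result.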
define i8 @test_v16i8_v16i1(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8_v16i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8_v16i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT:    paddb %xmm2, %xmm0
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i8_v16i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = icmp slt <16 x i8> %a0, zeroinitializer
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
  ret i8 %3
}

define i8 @test_v32i8_v32i1(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8_v32i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    psadbw %xmm2, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8_v32i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pcmpgtb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    psadbw %xmm2, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i8_v32i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i8_v32i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i8_v32i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp slt <32 x i8> %a0, zeroinitializer
  %2 = sext <32 x i1> %1 to <32 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %2)
  ret i8 %3
}

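; With AVX512BW the <64 x i8> compare below goes straight to a k-register
; (vpmovb2m) and is immediately rematerialized as bytes (vpmovm2b), after
; which the same paddb/psadbw reduction as in the narrower cases takes over.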
define i8 @test_v64i8_v64i1(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8_v64i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm5
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE2-NEXT:    paddb %xmm5, %xmm2
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pcmpgtb %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT:    paddb %xmm3, %xmm0
; SSE2-NEXT:    psadbw %xmm4, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_v64i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pcmpgtb %xmm2, %xmm5
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE41-NEXT:    paddb %xmm5, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpgtb %xmm3, %xmm0
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psadbw %xmm4, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_v64i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_v64i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_v64i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovb2m %zmm0, %k0
; AVX512-NEXT:    vpmovm2b %k0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp slt <64 x i8> %a0, zeroinitializer
  %2 = sext <64 x i1> %1 to <64 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %2)
  ret i8 %3
}

define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8_v128i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm8, %xmm8
; SSE2-NEXT:    pxor %xmm9, %xmm9
; SSE2-NEXT:    pcmpgtb %xmm4, %xmm9
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm4
; SSE2-NEXT:    paddb %xmm9, %xmm4
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pcmpgtb %xmm6, %xmm0
; SSE2-NEXT:    pxor %xmm6, %xmm6
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm6
; SSE2-NEXT:    paddb %xmm0, %xmm6
; SSE2-NEXT:    paddb %xmm4, %xmm6
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pcmpgtb %xmm5, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pcmpgtb %xmm7, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm6, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    psadbw %xmm8, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v128i8_v128i1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm8, %xmm8
; SSE41-NEXT:    pxor %xmm9, %xmm9
; SSE41-NEXT:    pcmpgtb %xmm4, %xmm9
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pcmpgtb %xmm0, %xmm4
; SSE41-NEXT:    paddb %xmm9, %xmm4
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpgtb %xmm6, %xmm0
; SSE41-NEXT:    pxor %xmm6, %xmm6
; SSE41-NEXT:    pcmpgtb %xmm2, %xmm6
; SSE41-NEXT:    paddb %xmm0, %xmm6
; SSE41-NEXT:    paddb %xmm4, %xmm6
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpgtb %xmm5, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpgtb %xmm7, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pcmpgtb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm6, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    psadbw %xmm8, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v128i8_v128i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm5
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm4, %xmm6
; AVX1-NEXT:    vpaddb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm6
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm4, %xmm7
; AVX1-NEXT:    vpaddb %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v128i8_v128i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm4, %ymm2
; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v128i8_v128i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovb2m %zmm0, %k0
; AVX512-NEXT:    vpmovb2m %zmm1, %k1
; AVX512-NEXT:    vpmovm2b %k1, %zmm0
; AVX512-NEXT:    vpmovm2b %k0, %zmm1
; AVX512-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = icmp slt <128 x i8> %a0, zeroinitializer
  %2 = sext <128 x i1> %1 to <128 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %2)
  ret i8 %3
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
