; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

;
; vXi64
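; The i8-source reductions below are expected to lower to PSADBW against a
; zero vector, which sums each group of eight bytes into a 64-bit lane; the
; i16 and i32 sources keep the zero-extension and reduce with shuffles and
; PADDQ, as the checks show.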
;

define i64 @test_v2i64_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    psrlq $32, %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i64_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <2 x i32> %a0 to <2 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <4 x i16> %a0 to <4 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1)
  ret i64 %2
}

define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i64_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i64_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1)
  ret i64 %2
}

define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i64_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1)
  ret i64 %2
}

;
; vXi32
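; Reductions to i32 follow the same pattern: the i8 sources still go through
; PSADBW (the scalar is read back with MOVD rather than MOVQ), while the
; v2i16 case zero-extends to 32 bits and adds the two lanes directly.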
;

define i32 @test_v2i32_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i32_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i32_v2i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = zext <2 x i16> %a0 to <2 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1)
  ret i32 %2
}

define i32 @test_v4i32(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = zext <4 x i8> %a0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @test_v8i32_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i32_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i32_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i32_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psadbw %xmm2, %xmm1
; SSE-NEXT:    psadbw %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i32_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <32 x i8> %a0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

;
; vXi16
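; For i16 results, the v8i8 and wider sources again rely on PSADBW, while the
; v2i8 and v4i8 cases zero-extend with PMOVZXBW (PUNPCKLBW on SSE2) and add
; in-register; only the low 16 bits of %eax are returned, as the '# kill'
; annotations note.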
;

define i16 @test_v2i16_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i16_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i16_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16_v2i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16_v2i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = zext <2 x i8> %a0 to <2 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1)
  ret i16 %2
}

define i16 @test_v4i16_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16_v4i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16_v4i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i16_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = zext <4 x i8> %a0 to <4 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
  ret i16 %2

}

define i16 @test_v8i16_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i16_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
  ret i16 %2
}

define i16 @test_v16i16_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  ret i16 %2
}

define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i16_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psadbw %xmm2, %xmm1
; SSE-NEXT:    psadbw %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <32 x i8> %a0 to <32 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
  ret i16 %2
}

define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
; SSE-LABEL: test_v64i16_v64i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    psadbw %xmm4, %xmm3
; SSE-NEXT:    psadbw %xmm4, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    psadbw %xmm4, %xmm2
; SSE-NEXT:    psadbw %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsadbw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsadbw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsadbw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <64 x i8> %a0 to <64 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2

}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)