; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL
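; Each test below masks (and) or shifts (lshr) the input lanes to a narrow
; value range before calling llvm.vector.reduce.add.*, checking that the x86
; backend exploits the reduced range, e.g. by truncating and summing bytes
; with psadbw on the AVX512 targets instead of performing full-width adds.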

;
; vXi64
;

define i64 @test_v2i64_v2i32(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v2i64_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v2i64_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v2i64_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v2i64_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
  %1 = and <2 x i64> %a0, <i64 255, i64 255>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v4i64_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v4i64_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = and <4 x i64> %a0, <i64 15, i64 31, i64 63, i64 127>
  %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1)
  ret i64 %2
}

define i64 @test_v8i64_v8i8(<8 x i64> %a0) {
; SSE2-LABEL: test_v8i64_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $60, %xmm2
; SSE2-NEXT:    psrlq $60, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    psrlq $60, %xmm3
; SSE2-NEXT:    psrlq $60, %xmm1
; SSE2-NEXT:    paddq %xmm3, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrlq $60, %xmm2
; SSE41-NEXT:    psrlq $60, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    psrlq $60, %xmm3
; SSE41-NEXT:    psrlq $60, %xmm1
; SSE41-NEXT:    paddq %xmm3, %xmm1
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $60, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $60, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $60, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = lshr <8 x i64> %a0, <i64 60, i64 60, i64 60, i64 60, i64 60, i64 60, i64 60, i64 60>
  %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1)
  ret i64 %2
}

define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
; SSE2-LABEL: test_v16i64_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [1,1]
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    paddq %xmm5, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm7
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    paddq %xmm7, %xmm3
; SSE2-NEXT:    paddq %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    paddq %xmm4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    paddq %xmm6, %xmm2
; SSE2-NEXT:    paddq %xmm0, %xmm2
; SSE2-NEXT:    paddq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i64_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm8 = [1,1]
; SSE41-NEXT:    pand %xmm8, %xmm5
; SSE41-NEXT:    pand %xmm8, %xmm1
; SSE41-NEXT:    paddq %xmm5, %xmm1
; SSE41-NEXT:    pand %xmm8, %xmm7
; SSE41-NEXT:    pand %xmm8, %xmm3
; SSE41-NEXT:    paddq %xmm7, %xmm3
; SSE41-NEXT:    paddq %xmm1, %xmm3
; SSE41-NEXT:    pand %xmm8, %xmm4
; SSE41-NEXT:    pand %xmm8, %xmm0
; SSE41-NEXT:    paddq %xmm4, %xmm0
; SSE41-NEXT:    pand %xmm8, %xmm6
; SSE41-NEXT:    pand %xmm8, %xmm2
; SSE41-NEXT:    paddq %xmm6, %xmm2
; SSE41-NEXT:    paddq %xmm0, %xmm2
; SSE41-NEXT:    paddq %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1]
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1]
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v16i64_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v16i64_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512BWVL-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = and <16 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1)
  ret i64 %2
}

;
; vXi32
;

define i32 @test_v2i32_v2i16(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i32_v2i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = and <2 x i32> %a0, <i32 255, i32 255>
  %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1)
  ret i32 %2
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v4i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    retq
  %1 = lshr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @test_v8i32_v8i8(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i32_v8i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i32_v8i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i32_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v8i32_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v8i32_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    paddd %xmm3, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i32_v16i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i32_v16i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = and <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
; SSE2-LABEL: test_v32i32_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    paddd %xmm5, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm7
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    paddd %xmm7, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    paddd %xmm4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    paddd %xmm6, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i32_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm8 = [255,255,255,255]
; SSE41-NEXT:    pand %xmm8, %xmm5
; SSE41-NEXT:    pand %xmm8, %xmm1
; SSE41-NEXT:    paddd %xmm5, %xmm1
; SSE41-NEXT:    pand %xmm8, %xmm7
; SSE41-NEXT:    pand %xmm8, %xmm3
; SSE41-NEXT:    paddd %xmm7, %xmm3
; SSE41-NEXT:    paddd %xmm1, %xmm3
; SSE41-NEXT:    pand %xmm8, %xmm4
; SSE41-NEXT:    pand %xmm8, %xmm0
; SSE41-NEXT:    paddd %xmm4, %xmm0
; SSE41-NEXT:    pand %xmm8, %xmm6
; SSE41-NEXT:    pand %xmm8, %xmm2
; SSE41-NEXT:    paddd %xmm6, %xmm2
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    paddd %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i32_v32i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-SLOW-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm5
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i32_v32i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-FAST-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm5
; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = and <32 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

;
; vXi16
;

define i16 @test_v2i16_v2i8(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i16_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i16_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16_v2i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16_v2i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = and <2 x i16> %a0, <i16 255, i16 255>
  %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1)
  ret i16 %2
}

define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16_v4i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16_v4i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v4i16_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v4i16_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BWVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    retq
  %1 = lshr <4 x i16> %a0, <i16 0, i16 1, i16 2, i16 3>
  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
  ret i16 %2
}

define i16 @test_v8i16_v8i8(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i16_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psadbw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i16_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = and <8 x i16> %a0, <i16 0, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
  ret i16 %2
}

define i16 @test_v16i16_v16i8(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i16_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psadbw %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v16i16_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  ret i16 %2
}

define i16 @test_v32i16_v32i8(<32 x i16> %a0) {
; SSE2-LABEL: test_v32i16_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psadbw %xmm1, %xmm2
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm3, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psadbw %xmm1, %xmm2
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
  ret i16 %2
}

define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
; SSE2-LABEL: test_v64i16_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127]
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    packuswb %xmm5, %xmm4
; SSE2-NEXT:    paddb %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm7
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    packuswb %xmm7, %xmm6
; SSE2-NEXT:    paddb %xmm2, %xmm6
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm6
; SSE2-NEXT:    psadbw %xmm0, %xmm4
; SSE2-NEXT:    paddq %xmm6, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT:    paddq %xmm4, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i16_v64i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127]
; SSE41-NEXT:    pand %xmm8, %xmm1
; SSE41-NEXT:    pand %xmm8, %xmm0
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm8, %xmm5
; SSE41-NEXT:    pand %xmm8, %xmm4
; SSE41-NEXT:    packuswb %xmm5, %xmm4
; SSE41-NEXT:    paddb %xmm0, %xmm4
; SSE41-NEXT:    pand %xmm8, %xmm3
; SSE41-NEXT:    pand %xmm8, %xmm2
; SSE41-NEXT:    packuswb %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm8, %xmm7
; SSE41-NEXT:    pand %xmm8, %xmm6
; SSE41-NEXT:    packuswb %xmm7, %xmm6
; SSE41-NEXT:    paddb %xmm2, %xmm6
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm0, %xmm6
; SSE41-NEXT:    psadbw %xmm0, %xmm4
; SSE41-NEXT:    paddq %xmm6, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE41-NEXT:    paddq %xmm4, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = and <64 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)