; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST

; Vectorized Pairwise Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return (x[0] + x[1]) + (x[2] + x[3]);
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
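; For concreteness, a minimal sketch of one possible instantiation of the
; pattern above, assuming STYPE = float and VTYPE = a GCC-style vector of
; four floats (the v4sf typedef is illustrative, not from the original):
;
;   typedef float v4sf __attribute__((vector_size(16)));
;   static inline float sum(v4sf x) {
;     /* pairwise: add lanes 0+1 and 2+3 first, then the two partial sums */
;     return (x[0] + x[1]) + (x[2] + x[3]);
;   }
;   v4sf sum4(v4sf A0, v4sf A1, v4sf A2, v4sf A3) {
;     return (v4sf){ sum(A0), sum(A1), sum(A2), sum(A3) };
;   }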
define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    haddps %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    haddps %xmm3, %xmm0
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX-SLOW-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %9 = fadd <2 x float> %7, %8
  %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %12 = fadd <2 x float> %10, %11
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %14 = fadd <2 x float> %12, %13
  %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %20 = fadd <2 x float> %18, %19
  %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %27 = fadd <2 x float> %25, %26
  %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %29
}

define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT:    retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %9 = add <2 x i32> %7, %8
  %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %12 = add <2 x i32> %10, %11
  %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %14 = add <2 x i32> %12, %13
  %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %20 = add <2 x i32> %18, %19
  %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %27 = add <2 x i32> %25, %26
  %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %29
}

define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    haddps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    haddps %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    haddps %xmm5, %xmm2
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT:    haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT:    haddps %xmm6, %xmm6
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    haddps %xmm5, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm4, %xmm2
; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT:    haddps %xmm7, %xmm6
; SSSE3-FAST-NEXT:    haddps %xmm6, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm4, %xmm1
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
; AVX2-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-FAST-NEXT:    retq
  %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %11 = fadd <2 x float> %9, %10
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %13 = fadd <2 x float> %11, %12
  %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %16 = fadd <2 x float> %14, %15
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %22 = fadd <2 x float> %20, %21
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %28 = fadd <2 x float> %26, %27
  %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %32 = fadd <2 x float> %30, %31
  %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = fadd <4 x float> %36, %39
  %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %45 = fadd <2 x float> %43, %44
  %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %48 = fadd <2 x float> %46, %47
  %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
  %51 = fadd <2 x float> %49, %50
  %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x float> %53
}

define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    phaddd %xmm5, %xmm2
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT:    phaddd %xmm7, %xmm6
; SSSE3-SLOW-NEXT:    phaddd %xmm6, %xmm6
; SSSE3-SLOW-NEXT:    palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    movdqa %xmm6, %xmm1
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm5, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm4, %xmm2
; SSSE3-FAST-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT:    phaddd %xmm6, %xmm6
; SSSE3-FAST-NEXT:    phaddd %xmm7, %xmm7
; SSSE3-FAST-NEXT:    phaddd %xmm7, %xmm6
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT:    vphaddd %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT:    vphaddd %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT:    retq
  %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %11 = add <2 x i32> %9, %10
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %13 = add <2 x i32> %11, %12
  %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %16 = add <2 x i32> %14, %15
  %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %22 = add <2 x i32> %20, %21
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %28 = add <2 x i32> %26, %27
  %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %32 = add <2 x i32> %30, %31
  %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = add <4 x i32> %36, %39
  %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %45 = add <2 x i32> %43, %44
  %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %48 = add <2 x i32> %46, %47
  %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
  %51 = add <2 x i32> %49, %50
  %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i32> %53
}

; Vectorized Sequential Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return ((x[0] + x[1]) + x[2]) + x[3];
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }

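; For concreteness, a minimal sketch of the sequential variant, assuming the
; same illustrative v4sf typedef as above (STYPE = float, VTYPE = v4sf):
;
;   static inline float sum(v4sf x) {
;     /* sequential: accumulate lanes strictly left to right */
;     return ((x[0] + x[1]) + x[2]) + x[3];
;   }
;   v4sf sum4(v4sf A0, v4sf A1, v4sf A2, v4sf A3) {
;     return (v4sf){ sum(A0), sum(A1), sum(A2), sum(A3) };
;   }
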
define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT:    addps %xmm5, %xmm4
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm3
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT:    movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm5
; SSSE3-FAST-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm2
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
; SSSE3-FAST-NEXT:    addps %xmm5, %xmm0
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm2
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm3
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm4
; AVX-SLOW-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm4
; AVX-FAST-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
  %9 = fadd <2 x float> %8, %7
  %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
  %11 = fadd <2 x float> %10, %9
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = fadd <4 x float> %13, %2
  %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = fadd <4 x float> %15, %14
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = fadd <4 x float> %17, %16
  %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = fadd <4 x float> %20, %3
  %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = fadd <4 x float> %22, %21
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = fadd <4 x float> %24, %23
  %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %26
}

define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-SLOW-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm6
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm4
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-FAST-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT:    paddd %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    movdqa %xmm3, %xmm5
; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm5
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm5, %xmm6
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-FAST-NEXT:    paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm2
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT:    retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
  %9 = add <2 x i32> %8, %7
  %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
  %11 = add <2 x i32> %10, %9
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = add <4 x i32> %13, %2
  %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = add <4 x i32> %15, %14
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = add <4 x i32> %17, %16
  %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = add <4 x i32> %20, %3
  %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = add <4 x i32> %22, %21
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = add <4 x i32> %24, %23
  %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %26
}

; Vectorized Reductions
; e.g.
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
; }

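; For concreteness, a minimal sketch of reduce for the float tests below,
; which call llvm.vector.reduce.fadd with a -0.0 start value (an in-order
; accumulation unless reassoc is set); v4sf as in the earlier sketches:
;
;   static inline float reduce(v4sf x) {
;     float acc = -0.0f;            /* start value of the fadd reduction */
;     for (int i = 0; i < 4; ++i)
;       acc += x[i];                /* strict in-order accumulation */
;     return acc;
;   }
;
; The integer tests use llvm.vector.reduce.add analogously.
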
define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm1
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm2
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm3
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm5, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm5, %xmm1
; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm2
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm3
; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT:    retq
  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9,   float %6, i32 1
  %11 = insertelement <4 x float> %10,  float %7, i32 2
  %12 = insertelement <4 x float> %11,  float %8, i32 3
  ret <4 x float> %12
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT:    addps %xmm4, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm4, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT:    addps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
; AVX-SLOW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX-SLOW-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT:    retq
  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9,   float %6, i32 1
  %11 = insertelement <4 x float> %10,  float %7, i32 2
  %12 = insertelement <4 x float> %11,  float %8, i32 3
  ret <4 x float> %12
}

define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    retq
  %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
  %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
  %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
  %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
  %9 = insertelement <4 x i32> undef, i32 %5, i32 0
  %10 = insertelement <4 x i32> %9,   i32 %6, i32 1
  %11 = insertelement <4 x i32> %10,  i32 %7, i32 2
  %12 = insertelement <4 x i32> %11,  i32 %8, i32 3
  ret <4 x i32> %12
}
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)