; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3           | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST

; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111

; Even/odd lane extracts + fadd, with the 2-wide result placed in the high
; lanes: should fold to a single (v)haddps.
define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSE-LABEL: hadd_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fadd <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %shuf
}

; Cross-half even/odd extracts + fadd, 4-wide result duplicated into both
; 128-bit halves: matched to a narrow haddps plus a duplication shuffle.
define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSE_SLOW-LABEL: hadd_v8f32a:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE_SLOW-NEXT:    haddps %xmm1, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movaps %xmm2, %xmm1
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v8f32a:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    movaps %xmm0, %xmm2
; SSE_FAST-NEXT:    haddps %xmm1, %xmm2
; SSE_FAST-NEXT:    haddps %xmm0, %xmm0
; SSE_FAST-NEXT:    movaps %xmm2, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v8f32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v8f32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1_FAST-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fadd <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

; Per-128-bit-lane even/odd extracts + fadd with in-lane result splats:
; should fold to a single 256-bit (v)haddps (two haddps on SSE).
define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSE-LABEL: hadd_v8f32b:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    haddps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fadd <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

; Even/odd lane extracts + fsub with the result duplicated to all lanes:
; should fold to a single (v)hsubps.
define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSE-LABEL: hsub_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fsub <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %shuf
}

; Cross-half even/odd extracts + fsub, 4-wide result duplicated into both
; 128-bit halves: matched to a narrow hsubps plus a duplication shuffle.
define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSE_SLOW-LABEL: hsub_v8f32a:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE_SLOW-NEXT:    hsubps %xmm1, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movaps %xmm2, %xmm1
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hsub_v8f32a:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    movaps %xmm0, %xmm2
; SSE_FAST-NEXT:    hsubps %xmm1, %xmm2
; SSE_FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE_FAST-NEXT:    movaps %xmm2, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v8f32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v8f32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1_FAST-NEXT:    vhsubps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fsub <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

; Per-128-bit-lane even/odd extracts + fsub with in-lane result splats:
; should fold to a single 256-bit (v)hsubps (two hsubps on SSE).
define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSE-LABEL: hsub_v8f32b:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm0, %xmm0
; SSE-NEXT:    hsubps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fsub <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

; Lane 0 + lane 1 fadd with the result splatted: haddpd only with fast-hops;
; the slow variants keep the scalar addsd + movddup sequence.
define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSE_SLOW-LABEL: hadd_v2f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v2f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fadd <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

; Same as hadd_v2f64, but built from scalar extract/fadd/insert + splat:
; should produce identical codegen to the vector-shuffle form.
define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
; SSE_SLOW-LABEL: hadd_v2f64_scalar_splat:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v2f64_scalar_splat:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %hop = fadd double %a0, %a1
  %ins = insertelement <2 x double> undef, double %hop, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

; Two scalar horizontal adds (lanes 0+1 and 2+3) with per-lane splats:
; should fold to a single 256-bit (v)haddpd on AVX.
define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
; SSE_SLOW-LABEL: hadd_v4f64_scalar_splat:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSE_SLOW-NEXT:    movapd %xmm1, %xmm3
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE_SLOW-NEXT:    addsd %xmm1, %xmm3
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v4f64_scalar_splat:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64_scalar_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins,  double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

; Only hop0 (lanes 0+1) survives — the final shuffle broadcasts element 0 to
; all four lanes, so the lanes 2+3 add is dead and a narrow hadd + broadcast
; suffices (vbroadcastsd on AVX2).
define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
; SSE_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    movapd %xmm0, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins,  double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuf
}

; Per-lane even/odd extracts + fadd with per-lane result splats:
; should fold to a single 256-bit (v)haddpd on AVX.
define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSE_SLOW-LABEL: hadd_v4f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE_SLOW-NEXT:    addsd %xmm1, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v4f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fadd <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

; Lane 0 - lane 1 with the result used only in lane 1 (lane 0 undef):
; hsubpd only with fast-hops; slow variants keep subsd + movddup.
define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSE_SLOW-LABEL: hsub_v2f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hsub_v2f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hsub_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fsub <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  ret <2 x double> %shuf
}

; Per-lane even/odd extracts + fsub with per-lane result splats:
; should fold to a single 256-bit (v)hsubpd on AVX.
define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSE_SLOW-LABEL: hsub_v4f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT:    subsd %xmm2, %xmm0
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE_SLOW-NEXT:    subsd %xmm2, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hsub_v4f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE_FAST-NEXT:    hsubpd %xmm1, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX-LABEL: hsub_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fsub <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

; Integer even/odd extracts + add: phaddd needs SSSE3; plain SSE3 falls back
; to pshufd + paddd.
define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSE3-LABEL: hadd_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE3-NEXT:    paddd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
  ret <4 x i32> %shuf
}

; Cross-half integer even/odd extracts + add, result duplicated into both
; halves: matched to a narrow phaddd plus a duplication shuffle.
define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT:    paddd %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hadd_v8i32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phaddd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v8i32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phaddd %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v8i32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v8i32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = add <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

; Per-128-bit-lane integer even/odd extracts + add with in-lane splats:
; single 256-bit vphaddd on AVX2, split vphaddd pair on AVX1.
define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32b:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; SSE3-NEXT:    paddd %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    phaddd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = add <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

; Integer even/odd extracts + sub with a partially-undef result shuffle:
; phsubd needs SSSE3; plain SSE3 falls back to pshufd + psubd.
define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSE3-LABEL: hsub_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
; SSE3-NEXT:    psubd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = sub <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
  ret <4 x i32> %shuf
}

; Cross-half integer even/odd extracts + sub, result duplicated into both
; halves: matched to a narrow phsubd plus a duplication shuffle.
define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hsub_v8i32a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT:    psubd %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hsub_v8i32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phsubd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v8i32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phsubd %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v8i32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v8i32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphsubd %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = sub <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

; Per-128-bit-lane integer even/odd extracts + sub with in-lane splats:
; single 256-bit vphsubd on AVX2, split vphsubd pair on AVX1.
define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSE3-LABEL: hsub_v8i32b:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE3-NEXT:    psubd %xmm2, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; SSE3-NEXT:    psubd %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    phsubd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = sub <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

; i16 even/odd extracts + add, result placed in the high half: phaddw needs
; SSSE3; plain SSE3 falls back to a long pshuflw/pshufhw/pshufd sequence.
define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSE3-LABEL: hadd_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE3-NEXT:    paddw %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}

; Cross-half i16 even/odd extracts + add, result duplicated into both
; halves: matched to a narrow phaddw plus a duplication shuffle.
define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hadd_v16i16a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm1, %xmm3
; SSE3-NEXT:    pslld $16, %xmm3
; SSE3-NEXT:    psrad $16, %xmm3
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pslld $16, %xmm2
; SSE3-NEXT:    psrad $16, %xmm2
; SSE3-NEXT:    packssdw %xmm3, %xmm2
; SSE3-NEXT:    psrad $16, %xmm1
; SSE3-NEXT:    psrad $16, %xmm0
; SSE3-NEXT:    packssdw %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hadd_v16i16a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phaddw %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v16i16a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phaddw %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v16i16a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v16i16a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphaddw %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = add <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

; hadd_v16i16b: horizontal add of adjacent i16 pairs where the final shuffle
; duplicates each 64-bit half of the hadd result within its 128-bit lane;
; should fold to phaddw per lane (single vphaddw ymm on AVX2).
778define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
779; SSE3-LABEL: hadd_v16i16b:
780; SSE3:       # %bb.0:
781; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
782; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
783; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
784; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
785; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
786; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
787; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
788; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
789; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
790; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
791; SSE3-NEXT:    paddw %xmm2, %xmm0
792; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
793; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
794; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
795; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
796; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
797; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
798; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
799; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
800; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
801; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
802; SSE3-NEXT:    paddw %xmm2, %xmm1
803; SSE3-NEXT:    retq
804;
805; SSSE3-LABEL: hadd_v16i16b:
806; SSSE3:       # %bb.0:
807; SSSE3-NEXT:    phaddw %xmm0, %xmm0
808; SSSE3-NEXT:    phaddw %xmm1, %xmm1
809; SSSE3-NEXT:    retq
810;
811; AVX1-LABEL: hadd_v16i16b:
812; AVX1:       # %bb.0:
813; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
814; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
815; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
816; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
817; AVX1-NEXT:    retq
818;
819; AVX2-LABEL: hadd_v16i16b:
820; AVX2:       # %bb.0:
821; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
822; AVX2-NEXT:    retq
  ; Even/odd element extraction (upper halves of each lane left undef),
  ; pairwise add, then a duplicate-quad shuffle that the hadd matcher folds.
823  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
824  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
825  %hop = add <16 x i16> %a0, %a1
826  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
827  ret <16 x i16> %shuf
828}
829
; hsub_v8i16: even-minus-odd pairwise subtract whose result shuffle should be
; eliminated, matching a single phsubw/vphsubw (PR34111 pattern).
830define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
831; SSE3-LABEL: hsub_v8i16:
832; SSE3:       # %bb.0:
833; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
834; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
835; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
836; SSE3-NEXT:    psubw %xmm1, %xmm0
837; SSE3-NEXT:    retq
838;
839; SSSE3-LABEL: hsub_v8i16:
840; SSSE3:       # %bb.0:
841; SSSE3-NEXT:    phsubw %xmm0, %xmm0
842; SSSE3-NEXT:    retq
843;
844; AVX-LABEL: hsub_v8i16:
845; AVX:       # %bb.0:
846; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
847; AVX-NEXT:    retq
848  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
849  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
850  %hop = sub <8 x i16> %a0246, %a1357
851  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
852  ret <8 x i16> %shuf
853}
854
; hsub_v16i16a: 256-bit horizontal subtract built from even/odd extracts of a
; <16 x i16>, with the 128-bit result splatted across both ymm lanes; the
; shuffle should fold into the phsubw/vphsubw output.
855define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
856; SSE3-LABEL: hsub_v16i16a:
857; SSE3:       # %bb.0:
858; SSE3-NEXT:    movdqa %xmm1, %xmm3
859; SSE3-NEXT:    pslld $16, %xmm3
860; SSE3-NEXT:    psrad $16, %xmm3
861; SSE3-NEXT:    movdqa %xmm0, %xmm2
862; SSE3-NEXT:    pslld $16, %xmm2
863; SSE3-NEXT:    psrad $16, %xmm2
864; SSE3-NEXT:    packssdw %xmm3, %xmm2
865; SSE3-NEXT:    psrad $16, %xmm1
866; SSE3-NEXT:    psrad $16, %xmm0
867; SSE3-NEXT:    packssdw %xmm1, %xmm0
868; SSE3-NEXT:    psubw %xmm0, %xmm2
869; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
870; SSE3-NEXT:    movdqa %xmm2, %xmm1
871; SSE3-NEXT:    retq
872;
873; SSSE3_SLOW-LABEL: hsub_v16i16a:
874; SSSE3_SLOW:       # %bb.0:
875; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
876; SSSE3_SLOW-NEXT:    phsubw %xmm1, %xmm2
877; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
878; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
879; SSSE3_SLOW-NEXT:    retq
880;
881; SSSE3_FAST-LABEL: hsub_v16i16a:
882; SSSE3_FAST:       # %bb.0:
883; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
884; SSSE3_FAST-NEXT:    phsubw %xmm1, %xmm2
885; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
886; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
887; SSSE3_FAST-NEXT:    retq
888;
889; AVX1_SLOW-LABEL: hsub_v16i16a:
890; AVX1_SLOW:       # %bb.0:
891; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
892; AVX1_SLOW-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
893; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
894; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
895; AVX1_SLOW-NEXT:    retq
896;
897; AVX1_FAST-LABEL: hsub_v16i16a:
898; AVX1_FAST:       # %bb.0:
899; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
900; AVX1_FAST-NEXT:    vphsubw %xmm1, %xmm0, %xmm1
901; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
902; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
903; AVX1_FAST-NEXT:    retq
904;
905; AVX2-LABEL: hsub_v16i16a:
906; AVX2:       # %bb.0:
907; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
908; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
909; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
910; AVX2-NEXT:    retq
911  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
912  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
913  %hop = sub <8 x i16> %a0, %a1
914  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
915  ret <16 x i16> %shuf
916}
917
; hsub_v16i16b: subtract counterpart of hadd_v16i16b — the duplicate-quad
; result shuffle should fold into per-lane phsubw (single vphsubw ymm on AVX2).
918define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
919; SSE3-LABEL: hsub_v16i16b:
920; SSE3:       # %bb.0:
921; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
922; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
923; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
924; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
925; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
926; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
927; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
928; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
929; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
930; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
931; SSE3-NEXT:    psubw %xmm2, %xmm0
932; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
933; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
934; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
935; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
936; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
937; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
938; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
939; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
940; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
941; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
942; SSE3-NEXT:    psubw %xmm2, %xmm1
943; SSE3-NEXT:    retq
944;
945; SSSE3-LABEL: hsub_v16i16b:
946; SSSE3:       # %bb.0:
947; SSSE3-NEXT:    phsubw %xmm0, %xmm0
948; SSSE3-NEXT:    phsubw %xmm1, %xmm1
949; SSSE3-NEXT:    retq
950;
951; AVX1-LABEL: hsub_v16i16b:
952; AVX1:       # %bb.0:
953; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
954; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
955; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
956; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
957; AVX1-NEXT:    retq
958;
959; AVX2-LABEL: hsub_v16i16b:
960; AVX2:       # %bb.0:
961; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
962; AVX2-NEXT:    retq
963  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
964  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
965  %hop = sub <16 x i16> %a0, %a1
966  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
967  ret <16 x i16> %shuf
968}
969
; broadcast_haddps_v4f32: splat of lane 0 of an haddps intrinsic result;
; the zero-splat shuffle should lower to movsldup (SSE/AVX1) or
; vbroadcastss (AVX2) rather than a generic shuffle.
970define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
971; SSE-LABEL: broadcast_haddps_v4f32:
972; SSE:       # %bb.0:
973; SSE-NEXT:    haddps %xmm0, %xmm0
974; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
975; SSE-NEXT:    retq
976;
977; AVX1-LABEL: broadcast_haddps_v4f32:
978; AVX1:       # %bb.0:
979; AVX1-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
980; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
981; AVX1-NEXT:    retq
982;
983; AVX2-LABEL: broadcast_haddps_v4f32:
984; AVX2:       # %bb.0:
985; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
986; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
987; AVX2-NEXT:    retq
988  %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
989  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
990  ret <4 x float> %2
991}
992
993declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
994
; PR34724_1: partial horizontal-add pattern (2-element extracts plus a
; separate lane-3 add) that should still combine to a single haddps;
; see https://bugs.llvm.org/show_bug.cgi?id=34724.
995define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
996; SSE-LABEL: PR34724_1:
997; SSE:       # %bb.0:
998; SSE-NEXT:    haddps %xmm1, %xmm0
999; SSE-NEXT:    retq
1000;
1001; AVX-LABEL: PR34724_1:
1002; AVX:       # %bb.0:
1003; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1004; AVX-NEXT:    retq
1005  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
1006  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
1007  %t2 = fadd <2 x float> %t0, %t1
1008  %vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
1009  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
1010  %t4 = fadd <4 x float> %t3, %b
1011  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
1012  ret <4 x float> %vecinit13
1013}
1014
; PR34724_2: same pattern as PR34724_1 but the partial extracts are
; 4-wide (upper elements undef) instead of 2-wide; must also combine
; to a single haddps.
1015define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
1016; SSE-LABEL: PR34724_2:
1017; SSE:       # %bb.0:
1018; SSE-NEXT:    haddps %xmm1, %xmm0
1019; SSE-NEXT:    retq
1020;
1021; AVX-LABEL: PR34724_2:
1022; AVX:       # %bb.0:
1023; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1024; AVX-NEXT:    retq
1025  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
1026  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
1027  %t2 = fadd <4 x float> %t0, %t1
1028  %vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
1029  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
1030  %t4 = fadd <4 x float> %t3, %b
1031  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
1032  ret <4 x float> %vecinit13
1033}
1034
1035;
1036; fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
1037;  --> SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))).
1038;
1039
; hadd_4f32_v8f32_shuffle: HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
; should become SHUFFLE(HOP(lo,hi)) — here a single haddps + movshdup.
1040define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
1041; SSE-LABEL: hadd_4f32_v8f32_shuffle:
1042; SSE:       # %bb.0:
1043; SSE-NEXT:    haddps %xmm1, %xmm0
1044; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1045; SSE-NEXT:    retq
1046;
1047; AVX-LABEL: hadd_4f32_v8f32_shuffle:
1048; AVX:       # %bb.0:
1049; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1050; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1051; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1052; AVX-NEXT:    vzeroupper
1053; AVX-NEXT:    retq
1054  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1055  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1056  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1057  %hadd0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1058  %hadd1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1059  %hadd = fadd <4 x float> %hadd0, %hadd1
1060  ret <4 x float> %hadd
1061}
1062
; hsub_4f32_v8f32_shuffle: same fold as hadd_4f32_v8f32_shuffle.
; NOTE(review): despite the "hsub" name, the IR below performs fadd, and the
; autogenerated CHECK lines accordingly expect haddps — presumably a
; copy-paste from the hadd test; confirm intent (and regenerate checks)
; before changing the opcode.
1063define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
1064; SSE-LABEL: hsub_4f32_v8f32_shuffle:
1065; SSE:       # %bb.0:
1066; SSE-NEXT:    haddps %xmm1, %xmm0
1067; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1068; SSE-NEXT:    retq
1069;
1070; AVX-LABEL: hsub_4f32_v8f32_shuffle:
1071; AVX:       # %bb.0:
1072; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1073; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
1074; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1075; AVX-NEXT:    vzeroupper
1076; AVX-NEXT:    retq
1077  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1078  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1079  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1080  %hsub0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1081  %hsub1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1082  %hsub = fadd <4 x float> %hsub0, %hsub1
1083  ret <4 x float> %hsub
1084}
1085
; hadd_4i32_v8i32_shuffle: integer version of the subvector-shuffle fold —
; expect a single phaddd + pshufd on SSSE3+/AVX.
1086define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
1087; SSE3-LABEL: hadd_4i32_v8i32_shuffle:
1088; SSE3:       # %bb.0:
1089; SSE3-NEXT:    movaps %xmm0, %xmm2
1090; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
1091; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
1092; SSE3-NEXT:    paddd %xmm2, %xmm0
1093; SSE3-NEXT:    retq
1094;
1095; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
1096; SSSE3:       # %bb.0:
1097; SSSE3-NEXT:    phaddd %xmm1, %xmm0
1098; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1099; SSSE3-NEXT:    retq
1100;
1101; AVX1-LABEL: hadd_4i32_v8i32_shuffle:
1102; AVX1:       # %bb.0:
1103; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1104; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
1105; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1106; AVX1-NEXT:    vzeroupper
1107; AVX1-NEXT:    retq
1108;
1109; AVX2-LABEL: hadd_4i32_v8i32_shuffle:
1110; AVX2:       # %bb.0:
1111; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1112; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
1113; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1114; AVX2-NEXT:    vzeroupper
1115; AVX2-NEXT:    retq
1116  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1117  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1118  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1119  %hadd0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1120  %hadd1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1121  %hadd = add <4 x i32> %hadd0, %hadd1
1122  ret <4 x i32> %hadd
1123}
1124
; hsub_4i32_v8i32_shuffle: same fold as hadd_4i32_v8i32_shuffle.
; NOTE(review): despite the "hsub" name, the IR below performs add, and the
; autogenerated CHECK lines accordingly expect paddd/phaddd — presumably a
; copy-paste from the hadd test; confirm intent (and regenerate checks)
; before changing the opcode.
1125define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
1126; SSE3-LABEL: hsub_4i32_v8i32_shuffle:
1127; SSE3:       # %bb.0:
1128; SSE3-NEXT:    movaps %xmm0, %xmm2
1129; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
1130; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
1131; SSE3-NEXT:    paddd %xmm2, %xmm0
1132; SSE3-NEXT:    retq
1133;
1134; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
1135; SSSE3:       # %bb.0:
1136; SSSE3-NEXT:    phaddd %xmm1, %xmm0
1137; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1138; SSSE3-NEXT:    retq
1139;
1140; AVX1-LABEL: hsub_4i32_v8i32_shuffle:
1141; AVX1:       # %bb.0:
1142; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1143; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
1144; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1145; AVX1-NEXT:    vzeroupper
1146; AVX1-NEXT:    retq
1147;
1148; AVX2-LABEL: hsub_4i32_v8i32_shuffle:
1149; AVX2:       # %bb.0:
1150; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1151; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
1152; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1153; AVX2-NEXT:    vzeroupper
1154; AVX2-NEXT:    retq
1155  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1156  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1157  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1158  %hsub0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1159  %hsub1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1160  %hsub = add <4 x i32> %hsub0, %hsub1
1161  ret <4 x i32> %hsub
1162}
1163
1164;
1165; fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) --> SHUFFLE(HOP(X,Y)).
1166;
1167
; hadd_4f64_v4f64_shuffle: HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) --> SHUFFLE(HOP(X,Y))
; for v4f64; AVX2 gets a single vhaddpd ymm + vpermpd.
1168define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
1169; SSE-LABEL: hadd_4f64_v4f64_shuffle:
1170; SSE:       # %bb.0:
1171; SSE-NEXT:    haddpd %xmm1, %xmm0
1172; SSE-NEXT:    haddpd %xmm3, %xmm2
1173; SSE-NEXT:    movapd %xmm2, %xmm1
1174; SSE-NEXT:    retq
1175;
1176; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
1177; AVX1:       # %bb.0:
1178; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
1179; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1180; AVX1-NEXT:    vhaddpd %ymm0, %ymm2, %ymm0
1181; AVX1-NEXT:    retq
1182;
1183; AVX2-LABEL: hadd_4f64_v4f64_shuffle:
1184; AVX2:       # %bb.0:
1185; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
1186; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1187; AVX2-NEXT:    retq
1188  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1189  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1190  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1191  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1192  %hadd = fadd <4 x double> %hadd0, %hadd1
1193  ret <4 x double> %hadd
1194}
1195
; hsub_4f64_v4f64_shuffle: subtract version of the v4f64 cross-operand
; shuffle fold; expect hsubpd/vhsubpd. (Value names reuse the "hadd"
; prefix from the template, but the op here really is fsub.)
1196define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
1197; SSE-LABEL: hsub_4f64_v4f64_shuffle:
1198; SSE:       # %bb.0:
1199; SSE-NEXT:    hsubpd %xmm1, %xmm0
1200; SSE-NEXT:    hsubpd %xmm3, %xmm2
1201; SSE-NEXT:    movapd %xmm2, %xmm1
1202; SSE-NEXT:    retq
1203;
1204; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
1205; AVX1:       # %bb.0:
1206; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
1207; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1208; AVX1-NEXT:    vhsubpd %ymm0, %ymm2, %ymm0
1209; AVX1-NEXT:    retq
1210;
1211; AVX2-LABEL: hsub_4f64_v4f64_shuffle:
1212; AVX2:       # %bb.0:
1213; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
1214; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1215; AVX2-NEXT:    retq
1216  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1217  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1218  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1219  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1220  %hadd = fsub <4 x double> %hadd0, %hadd1
1221  ret <4 x double> %hadd
1222}
1223
; hadd_8f32_v8f32_shuffle: v8f32 cross-operand shuffle fold; AVX2 gets a
; single vhaddps ymm + vpermpd lane fixup.
1224define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
1225; SSE-LABEL: hadd_8f32_v8f32_shuffle:
1226; SSE:       # %bb.0:
1227; SSE-NEXT:    haddps %xmm1, %xmm0
1228; SSE-NEXT:    haddps %xmm3, %xmm2
1229; SSE-NEXT:    movaps %xmm2, %xmm1
1230; SSE-NEXT:    retq
1231;
1232; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
1233; AVX1:       # %bb.0:
1234; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
1235; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1236; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
1237; AVX1-NEXT:    retq
1238;
1239; AVX2-LABEL: hadd_8f32_v8f32_shuffle:
1240; AVX2:       # %bb.0:
1241; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
1242; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1243; AVX2-NEXT:    retq
1244  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1245  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1246  %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1247  %hadd1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1248  %hadd = fadd <8 x float> %hadd0, %hadd1
1249  ret <8 x float> %hadd
1250}
1251
; hsub_8f32_v8f32_shuffle: same fold as hadd_8f32_v8f32_shuffle.
; NOTE(review): despite the "hsub" name, the IR below performs fadd, and the
; autogenerated CHECK lines accordingly expect haddps/vhaddps — presumably a
; copy-paste from the hadd test; confirm intent (and regenerate checks)
; before changing the opcode.
1252define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
1253; SSE-LABEL: hsub_8f32_v8f32_shuffle:
1254; SSE:       # %bb.0:
1255; SSE-NEXT:    haddps %xmm1, %xmm0
1256; SSE-NEXT:    haddps %xmm3, %xmm2
1257; SSE-NEXT:    movaps %xmm2, %xmm1
1258; SSE-NEXT:    retq
1259;
1260; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
1261; AVX1:       # %bb.0:
1262; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
1263; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1264; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
1265; AVX1-NEXT:    retq
1266;
1267; AVX2-LABEL: hsub_8f32_v8f32_shuffle:
1268; AVX2:       # %bb.0:
1269; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
1270; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1271; AVX2-NEXT:    retq
1272  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1273  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1274  %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1275  %hsub1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1276  %hsub = fadd <8 x float> %hsub0, %hsub1
1277  ret <8 x float> %hsub
1278}
1279
; hadd_8i32_v8i32_shuffle: v8i32 cross-operand shuffle fold; AVX2 gets a
; single vphaddd ymm + vpermq lane fixup.
1280define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
1281; SSE3-LABEL: hadd_8i32_v8i32_shuffle:
1282; SSE3:       # %bb.0:
1283; SSE3-NEXT:    movaps %xmm2, %xmm4
1284; SSE3-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
1285; SSE3-NEXT:    movaps %xmm0, %xmm5
1286; SSE3-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
1287; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
1288; SSE3-NEXT:    paddd %xmm2, %xmm4
1289; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1290; SSE3-NEXT:    paddd %xmm5, %xmm0
1291; SSE3-NEXT:    movdqa %xmm4, %xmm1
1292; SSE3-NEXT:    retq
1293;
1294; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
1295; SSSE3:       # %bb.0:
1296; SSSE3-NEXT:    phaddd %xmm1, %xmm0
1297; SSSE3-NEXT:    phaddd %xmm3, %xmm2
1298; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1299; SSSE3-NEXT:    retq
1300;
1301; AVX1-LABEL: hadd_8i32_v8i32_shuffle:
1302; AVX1:       # %bb.0:
1303; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1304; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
1305; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1306; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
1307; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1308; AVX1-NEXT:    retq
1309;
1310; AVX2-LABEL: hadd_8i32_v8i32_shuffle:
1311; AVX2:       # %bb.0:
1312; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
1313; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1314; AVX2-NEXT:    retq
1315  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1316  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1317  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1318  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1319  %hadd = add <8 x i32> %hadd0, %hadd1
1320  ret <8 x i32> %hadd
1321}
1322
; hsub_8i32_v8i32_shuffle: subtract version of the v8i32 cross-operand
; shuffle fold; expect phsubd/vphsubd. (Value names reuse the "hadd"
; prefix from the template, but the op here really is sub.)
1323define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
1324; SSE3-LABEL: hsub_8i32_v8i32_shuffle:
1325; SSE3:       # %bb.0:
1326; SSE3-NEXT:    movaps %xmm2, %xmm4
1327; SSE3-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
1328; SSE3-NEXT:    movaps %xmm0, %xmm5
1329; SSE3-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
1330; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
1331; SSE3-NEXT:    psubd %xmm2, %xmm4
1332; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1333; SSE3-NEXT:    psubd %xmm0, %xmm5
1334; SSE3-NEXT:    movdqa %xmm5, %xmm0
1335; SSE3-NEXT:    movdqa %xmm4, %xmm1
1336; SSE3-NEXT:    retq
1337;
1338; SSSE3-LABEL: hsub_8i32_v8i32_shuffle:
1339; SSSE3:       # %bb.0:
1340; SSSE3-NEXT:    phsubd %xmm1, %xmm0
1341; SSSE3-NEXT:    phsubd %xmm3, %xmm2
1342; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1343; SSSE3-NEXT:    retq
1344;
1345; AVX1-LABEL: hsub_8i32_v8i32_shuffle:
1346; AVX1:       # %bb.0:
1347; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1348; AVX1-NEXT:    vphsubd %xmm2, %xmm1, %xmm1
1349; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1350; AVX1-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
1351; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1352; AVX1-NEXT:    retq
1353;
1354; AVX2-LABEL: hsub_8i32_v8i32_shuffle:
1355; AVX2:       # %bb.0:
1356; AVX2-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
1357; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1358; AVX2-NEXT:    retq
1359  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1360  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1361  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1362  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1363  %hadd = sub <8 x i32> %hadd0, %hadd1
1364  ret <8 x i32> %hadd
1365}
1366
; hadd_16i16_16i16_shuffle: v16i16 cross-operand shuffle fold; AVX2 gets a
; single vphaddw ymm + vpermq lane fixup.
1367define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
1368; SSE3-LABEL: hadd_16i16_16i16_shuffle:
1369; SSE3:       # %bb.0:
1370; SSE3-NEXT:    movdqa %xmm3, %xmm5
1371; SSE3-NEXT:    pslld $16, %xmm5
1372; SSE3-NEXT:    psrad $16, %xmm5
1373; SSE3-NEXT:    movdqa %xmm2, %xmm4
1374; SSE3-NEXT:    pslld $16, %xmm4
1375; SSE3-NEXT:    psrad $16, %xmm4
1376; SSE3-NEXT:    packssdw %xmm5, %xmm4
1377; SSE3-NEXT:    movdqa %xmm1, %xmm5
1378; SSE3-NEXT:    pslld $16, %xmm5
1379; SSE3-NEXT:    psrad $16, %xmm5
1380; SSE3-NEXT:    movdqa %xmm0, %xmm6
1381; SSE3-NEXT:    pslld $16, %xmm6
1382; SSE3-NEXT:    psrad $16, %xmm6
1383; SSE3-NEXT:    packssdw %xmm5, %xmm6
1384; SSE3-NEXT:    psrad $16, %xmm3
1385; SSE3-NEXT:    psrad $16, %xmm2
1386; SSE3-NEXT:    packssdw %xmm3, %xmm2
1387; SSE3-NEXT:    paddw %xmm2, %xmm4
1388; SSE3-NEXT:    psrad $16, %xmm1
1389; SSE3-NEXT:    psrad $16, %xmm0
1390; SSE3-NEXT:    packssdw %xmm1, %xmm0
1391; SSE3-NEXT:    paddw %xmm6, %xmm0
1392; SSE3-NEXT:    movdqa %xmm4, %xmm1
1393; SSE3-NEXT:    retq
1394;
1395; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
1396; SSSE3:       # %bb.0:
1397; SSSE3-NEXT:    phaddw %xmm1, %xmm0
1398; SSSE3-NEXT:    phaddw %xmm3, %xmm2
1399; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1400; SSSE3-NEXT:    retq
1401;
1402; AVX1-LABEL: hadd_16i16_16i16_shuffle:
1403; AVX1:       # %bb.0:
1404; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1405; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
1406; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1407; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
1408; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1409; AVX1-NEXT:    retq
1410;
1411; AVX2-LABEL: hadd_16i16_16i16_shuffle:
1412; AVX2:       # %bb.0:
1413; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
1414; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1415; AVX2-NEXT:    retq
1416  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
1417  %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1418  %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
1419  %hadd1 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
1420  %hadd = add <16 x i16> %hadd0, %hadd1
1421  ret <16 x i16> %hadd
1422}
1423