; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

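; This file checks that chains of extractelement/binop/insertelement that
; compute pairwise sums or differences of adjacent vector lanes are matched
; to the horizontal add/sub instructions (haddps, hsubpd, phaddd, etc.).
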
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

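; Integer horizontal adds/subs require SSSE3 (phaddd/phsubd); with only SSE3
; the same patterns are expanded into scalar adds/subs, as the SSE3 check
; lines below show.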
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

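; For <2 x double>, a single pair per operand maps directly to (v)haddpd and
; (v)hsubpd. In hadd_pd_test2 the fadd operands are commuted; the fold still
; applies because fadd is commutative.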
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

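; vhaddpd/vhsubpd on 256-bit vectors operate within each 128-bit lane, so in
; the two tests below AVX1 rearranges the lanes up front (vperm2f128 +
; vinsertf128) while AVX2 fixes up the result order with a vpermpd.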
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: avx_vhadd_pd_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx_vhadd_pd_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: avx_vhsub_pd_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx_vhsub_pd_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

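; 256-bit integer horizontal adds need AVX2. AVX1 splits the operation into
; two 128-bit vphaddd ops, and even the AVX2 form needs a vpermq because
; vphaddd works within each 128-bit lane.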
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm2, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %r8d, %r9d
; SSE3-NEXT:    movd %xmm3, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %r8d, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %r8d, %r11d
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %r9d, %xmm3
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

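; Same as avx2_vphadd_d_test, but with 16-bit elements (phaddw/vphaddw).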
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pextrw $1, %xmm0, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %edx
; SSE3-NEXT:    pextrw $3, %xmm0, %eax
; SSE3-NEXT:    addl %edx, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm0, %edx
; SSE3-NEXT:    pextrw $5, %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pextrw $6, %xmm0, %edx
; SSE3-NEXT:    pextrw $7, %xmm0, %r8d
; SSE3-NEXT:    addl %edx, %r8d
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pextrw $1, %xmm1, %r10d
; SSE3-NEXT:    addl %edx, %r10d
; SSE3-NEXT:    pextrw $2, %xmm1, %edx
; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
; SSE3-NEXT:    addl %edx, %ebx
; SSE3-NEXT:    pextrw $4, %xmm1, %edx
; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
; SSE3-NEXT:    addl %edx, %r14d
; SSE3-NEXT:    pextrw $6, %xmm1, %edx
; SSE3-NEXT:    pextrw $7, %xmm1, %r12d
; SSE3-NEXT:    addl %edx, %r12d
; SSE3-NEXT:    movd %xmm2, %edi
; SSE3-NEXT:    pextrw $1, %xmm2, %edx
; SSE3-NEXT:    addl %edi, %edx
; SSE3-NEXT:    pextrw $2, %xmm2, %r9d
; SSE3-NEXT:    pextrw $3, %xmm2, %edi
; SSE3-NEXT:    addl %r9d, %edi
; SSE3-NEXT:    pextrw $4, %xmm2, %r11d
; SSE3-NEXT:    pextrw $5, %xmm2, %r9d
; SSE3-NEXT:    addl %r11d, %r9d
; SSE3-NEXT:    pextrw $6, %xmm2, %ebp
; SSE3-NEXT:    pextrw $7, %xmm2, %r11d
; SSE3-NEXT:    addl %ebp, %r11d
; SSE3-NEXT:    movd %xmm3, %r15d
; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
; SSE3-NEXT:    addl %r15d, %ebp
; SSE3-NEXT:    pextrw $2, %xmm3, %r13d
; SSE3-NEXT:    pextrw $3, %xmm3, %r15d
; SSE3-NEXT:    addl %r13d, %r15d
; SSE3-NEXT:    pextrw $4, %xmm3, %r13d
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %r13d, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %r13d
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %r13d, %eax
; SSE3-NEXT:    movd %r12d, %xmm4
; SSE3-NEXT:    movd %r14d, %xmm2
; SSE3-NEXT:    movd %ebx, %xmm5
; SSE3-NEXT:    movd %r10d, %xmm3
; SSE3-NEXT:    movd %r8d, %xmm6
; SSE3-NEXT:    movd %esi, %xmm7
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm8 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movd %ecx, %xmm10
; SSE3-NEXT:    movd %r15d, %xmm11
; SSE3-NEXT:    movd %ebp, %xmm12
; SSE3-NEXT:    movd %r11d, %xmm13
; SSE3-NEXT:    movd %r9d, %xmm14
; SSE3-NEXT:    movd %edi, %xmm15
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
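; In each case the operand order of at least one scalar subtraction is swapped
; relative to the lane pattern that hsubps/phsubd/hsubpd compute.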

define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %ecx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
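; Here the pairwise results are inserted in an order that already matches the
; per-128-bit-lane semantics of vhaddps/vhsubps/vhaddpd/vhsubpd, so a single
; 256-bit instruction suffices and no cross-lane shuffle is needed.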

define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
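; Unlike avx2_vphadd_d_test and avx2_vphadd_w_test above, these tests
; interleave the sums of %a and %b within each 128-bit lane, so the result
; already matches the native lane order of vphaddd/vphaddw and the AVX2
; codegen needs no vpermq.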

define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm1, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %r8d, %r9d
; SSE3-NEXT:    movd %xmm3, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %r8d, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %r8d, %r11d
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %r9d, %xmm3
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    movd %xmm1, %ecx
; SSE3-NEXT:    pextrw $1, %xmm1, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %edi
; SSE3-NEXT:    pextrw $3, %xmm1, %eax
; SSE3-NEXT:    addl %edi, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm1, %r8d
; SSE3-NEXT:    pextrw $5, %xmm1, %edi
; SSE3-NEXT:    addl %r8d, %edi
; SSE3-NEXT:    pextrw $6, %xmm1, %r11d
; SSE3-NEXT:    pextrw $7, %xmm1, %r8d
; SSE3-NEXT:    addl %r11d, %r8d
; SSE3-NEXT:    movd %xmm2, %r11d
; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
; SSE3-NEXT:    addl %r11d, %ebp
; SSE3-NEXT:    pextrw $2, %xmm2, %r11d
; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
; SSE3-NEXT:    addl %r11d, %r14d
; SSE3-NEXT:    pextrw $4, %xmm2, %r11d
; SSE3-NEXT:    pextrw $5, %xmm2, %r15d
; SSE3-NEXT:    addl %r11d, %r15d
; SSE3-NEXT:    pextrw $6, %xmm2, %r11d
; SSE3-NEXT:    pextrw $7, %xmm2, %r12d
; SSE3-NEXT:    addl %r11d, %r12d
; SSE3-NEXT:    movd %xmm3, %ebx
; SSE3-NEXT:    pextrw $1, %xmm3, %r11d
; SSE3-NEXT:    addl %ebx, %r11d
; SSE3-NEXT:    pextrw $2, %xmm3, %r13d
; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
; SSE3-NEXT:    addl %r13d, %ebx
; SSE3-NEXT:    pextrw $4, %xmm3, %r13d
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %r13d, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %r13d
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %r13d, %eax
; SSE3-NEXT:    movd %r12d, %xmm4
; SSE3-NEXT:    movd %r15d, %xmm2
; SSE3-NEXT:    movd %r14d, %xmm5
; SSE3-NEXT:    movd %ebp, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm6
; SSE3-NEXT:    movd %r9d, %xmm7
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movd %ecx, %xmm10
; SSE3-NEXT:    movd %ebx, %xmm11
; SSE3-NEXT:    movd %r11d, %xmm12
; SSE3-NEXT:    movd %r8d, %xmm13
; SSE3-NEXT:    movd %edi, %xmm14
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}