; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF

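; Splitting the concatenation of two v8i16 sources into even and odd lanes and
; adding them should fold to a single (v)phaddw. phaddw2 exercises a different
; but equivalent pair of shuffle masks.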
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

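; The same even/odd shuffle + add pattern on v4i32 should fold to (v)phaddd.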
define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

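; Single-source variants: some shuffle mask elements are undef, but the defined
; lanes still form a horizontal add, so a (v)phaddd is emitted.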
define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd4:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd5:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

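; phaddd6 only defines lane 0 (x[0] + x[1]); targets without fast-hops prefer a
; pshufd + paddd over the horizontal op. phaddd7 only defines lane 1 and still
; folds to (v)phaddd on all targets.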
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd7:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

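; Horizontal subtract: even lanes minus odd lanes folds to (v)phsubw/(v)phsubd,
; including the single-source variants with undef mask elements.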
define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

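; As with phaddd6, only lane 0 (x[0] - x[1]) is defined; slow-hops targets use
; a pshufd + psubd instead of the horizontal op.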
define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

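; Reversed operand order (odd lanes minus even lanes) does not match the phsub
; semantics, so no horizontal subtract is formed.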
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm3, %xmm1
; SSSE3-NEXT:    pshufb %xmm3, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    psubw %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm2
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm3
; AVX-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

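; Single-source patterns where the horizontal sums land in upper lanes or feed
; a further shuffle; (v)phaddd is still formed where profitable, with slow-hops
; targets falling back to shuffle + add.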
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

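; The same single-source patterns for v8i16, folding to (v)phaddw where
; profitable.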
define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

; PR39921 + PR39936
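; Horizontal sum reduction of a v8i32 to a scalar: fast-hops targets use a
; chain of (v)phaddd ops, slow-hops targets finish with a shuffle + add.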
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    vmovd %xmm0, %eax
; AVX2-SHUF-NEXT:    vzeroupper
; AVX2-SHUF-NEXT:    retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}