; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX

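;
; Unpacks of two horizontal-op results select whole lanes, so they fold to a
; single horizontal op plus a lane shuffle: unpacklo (mask <0,4,1,5>) only
; reads the lanes computed from %0 and %2, unpackhi (mask <2,6,3,7>) only the
; lanes computed from %1 and %3, leaving the other two operands dead.
;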
define <4 x float> @test_unpacklo_hadd_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpacklo_hadd_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hadd_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %7
}

define <4 x float> @test_unpackhi_hadd_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpackhi_hadd_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hadd_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhaddps %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %7
}

define <4 x float> @test_unpacklo_hsub_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpacklo_hsub_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hsub_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhsubps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %7
}

define <4 x float> @test_unpackhi_hsub_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpackhi_hsub_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hsub_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %7
}

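; The same unpack folds apply to the integer phaddd/phsubd intrinsics.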
define <4 x i32> @test_unpacklo_hadd_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpacklo_hadd_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hadd_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %7
}

define <4 x i32> @test_unpackhi_hadd_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpackhi_hadd_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hadd_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %7
}

define <4 x i32> @test_unpacklo_hsub_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpacklo_hsub_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phsubd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hsub_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %7
}

define <4 x i32> @test_unpackhi_hsub_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpackhi_hsub_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phsubd %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hsub_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphsubd %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %7
}

;
; Special Case
;
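; Unary case: unpacklo of a hadd result with itself just duplicates the two
; low lanes, so a single hadd plus a duplicating shuffle suffices.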
define <4 x float> @test_unpacklo_hadd_v4f32_unary(<4 x float> %0) {
; SSE-LABEL: test_unpacklo_hadd_v4f32_unary:
; SSE:       ## %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hadd_v4f32_unary:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    ret{{[l|q]}}
  %2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) #4
  %3 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %3
}

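; Reduced from PR51974: the second phaddw reuses the original operand %a0, so
; the two horizontal ops must not be merged.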
define <8 x i16> @PR51974(<8 x i16> %a0) {
; SSE-LABEL: PR51974:
; SSE:       ## %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: PR51974:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %r0 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a0)
  %r1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r0, <8 x i16> %a0)
  ret <8 x i16> %r1
}

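; Reduced from PR52040: a chain of phaddw ops reusing earlier results; all
; three horizontal ops must survive.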
define <8 x i16> @PR52040(<8 x i16> %a0) {
; SSE-LABEL: PR52040:
; SSE:       ## %bb.0:
; SSE-NEXT:    phaddw %xmm0, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: PR52040:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %r1 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a0)
  %r2 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r1, <8 x i16> %r1)
  %r3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r2, <8 x i16> %r1)
  ret <8 x i16> %r3
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)