xref: /llvm-project/llvm/test/CodeGen/X86/horizontal-shuffle.ll (revision 72e242a286be1c821c521fdc8a778517b193a59e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s
4
;
; 128-bit Vectors
;
8
; Low-half unpack of two HADDPS results: the mask keeps elements 0-1 of each
; call. The assertions expect a single vhaddps of xmm0 and xmm2 with no
; separate shuffle instruction.
define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; CHECK-LABEL: test_unpackl_fhadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  %h23 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a2, <4 x float> %a3)
  %res = shufflevector <4 x float> %h01, <4 x float> %h23, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}
19
; High-element unpack of two HADDPD results: the mask keeps element 1 of each
; call. The assertions expect a single vhaddpd of xmm1 and xmm3 with no
; separate shuffle instruction.
define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; CHECK-LABEL: test_unpackh_fhadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddpd %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  %h23 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a2, <2 x double> %a3)
  %res = shufflevector <2 x double> %h01, <2 x double> %h23, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}
30
; Low-element unpack of two HSUBPD results: the mask keeps element 0 of each
; call. The assertions expect a single vhsubpd of xmm0 and xmm2 with no
; separate shuffle instruction.
define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; CHECK-LABEL: test_unpackl_fhsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  %d23 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a2, <2 x double> %a3)
  %res = shufflevector <2 x double> %d01, <2 x double> %d23, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %res
}
41
; High-half unpack of two HSUBPS results: the mask keeps elements 2-3 of each
; call. The assertions expect a single vhsubps of xmm1 and xmm3 with no
; separate shuffle instruction.
define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; CHECK-LABEL: test_unpackh_fhsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  %d23 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a2, <4 x float> %a3)
  %res = shufflevector <4 x float> %d01, <4 x float> %d23, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %res
}
52
; Low-half unpack of two PHADDW results: the mask keeps elements 0-3 of each
; call. The assertions expect a single vphaddw of xmm0 and xmm2 with no
; separate shuffle instruction.
define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackl_hadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %h23 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %res = shufflevector <8 x i16> %h01, <8 x i16> %h23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %res
}
63
; High-half unpack of two PHADDD results: the mask keeps elements 2-3 of each
; call. The assertions expect a single vphaddd of xmm1 and xmm3 with no
; separate shuffle instruction.
define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackh_hadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %h23 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %res = shufflevector <4 x i32> %h01, <4 x i32> %h23, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i32> %res
}
74
; Low-half unpack of two PHSUBD results: the mask keeps elements 0-1 of each
; call. The assertions expect a single vphsubd of xmm0 and xmm2 with no
; separate shuffle instruction.
define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackl_hsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %d23 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %res = shufflevector <4 x i32> %d01, <4 x i32> %d23, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %res
}
85
; High-half unpack of two PHSUBW results: the mask keeps elements 4-7 of each
; call. The assertions expect a single vphsubw of xmm1 and xmm3 with no
; separate shuffle instruction.
define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackh_hsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubw %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %d23 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %res = shufflevector <8 x i16> %d01, <8 x i16> %d23, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %res
}
96
; Low-half unpack of two PACKSSWB results: the mask keeps elements 0-7 of each
; call. The assertions expect a single vpacksswb of xmm0 and xmm2 with no
; separate shuffle instruction.
define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackl_packss_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %p23 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %res = shufflevector <16 x i8> %p01, <16 x i8> %p23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %res
}
107
; High-half unpack of two PACKSSDW results: the mask keeps elements 4-7 of each
; call. The assertions expect a single vpackssdw of xmm1 and xmm3 with no
; separate shuffle instruction.
define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackh_packss_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %p23 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %res = shufflevector <8 x i16> %p01, <8 x i16> %p23, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %res
}
118
; Low-half unpack of two PACKUSDW results: the mask keeps elements 0-3 of each
; call. The assertions expect a single vpackusdw of xmm0 and xmm2 with no
; separate shuffle instruction.
define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackl_packus_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  %p23 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
  %res = shufflevector <8 x i16> %p01, <8 x i16> %p23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %res
}
129
; High-half unpack of two PACKUSWB results: the mask keeps elements 8-15 of
; each call. The assertions expect a single vpackuswb of xmm1 and xmm3 with no
; separate shuffle instruction.
define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackh_packus_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %p23 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %res = shufflevector <16 x i8> %p01, <16 x i8> %p23, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %res
}
140
; Two-input float shuffle over bitcast PACKSSDW results: elements 0-1 of the
; first call, then element 2 of the second call twice. The assertions expect
; one vpackssdw of xmm0 and xmm3 followed by a vpshufd with pattern [0,1,2,2].
define <4 x float> @test_shufps_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_shufps_packss_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %p23 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %f01 = bitcast <8 x i16> %p01 to <4 x float>
  %f23 = bitcast <8 x i16> %p23 to <4 x float>
  %res = shufflevector <4 x float> %f01, <4 x float> %f23, <4 x i32> <i32 0, i32 1, i32 6, i32 6>
  ret <4 x float> %res
}
154
; Two-input float shuffle over bitcast PACKUSWB results: elements 1 and 0 of
; the first call, then element 0 of the second call twice. The assertions
; expect one vpackuswb of xmm0 and xmm2 followed by a vpshufd with pattern
; [1,0,2,2].
define <4 x float> @test_shufps_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_shufps_packus_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %p23 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %f01 = bitcast <16 x i8> %p01 to <4 x float>
  %f23 = bitcast <16 x i8> %p23 to <4 x float>
  %res = shufflevector <4 x float> %f01, <4 x float> %f23, <4 x i32> <i32 1, i32 0, i32 4, i32 4>
  ret <4 x float> %res
}
168
;
; 256-bit Vectors
;
172
; 256-bit low unpack of two HADDPS results: the mask keeps elements 0-1 and
; 4-5 of each call. The assertions expect a single vhaddps of ymm0 and ymm2
; with no separate shuffle instruction.
define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; CHECK-LABEL: test_unpackl_fhadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  %h23 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
  %res = shufflevector <8 x float> %h01, <8 x float> %h23, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x float> %res
}
183
; 256-bit high unpack of two HADDPD results: the mask keeps elements 1 and 3
; of each call. The assertions expect a single vhaddpd of ymm1 and ymm3 with
; no separate shuffle instruction.
define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; CHECK-LABEL: test_unpackh_fhadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddpd %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  %h23 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a2, <4 x double> %a3)
  %res = shufflevector <4 x double> %h01, <4 x double> %h23, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}
194
; 256-bit low unpack of two HSUBPD results: the mask keeps elements 0 and 2 of
; each call. The assertions expect a single vhsubpd of ymm0 and ymm2 with no
; separate shuffle instruction.
define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; CHECK-LABEL: test_unpackl_fhsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  %d23 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a2, <4 x double> %a3)
  %res = shufflevector <4 x double> %d01, <4 x double> %d23, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}
205
; 256-bit high unpack of two HSUBPS results: the mask keeps elements 2-3 and
; 6-7 of each call. The assertions expect a single vhsubps of ymm1 and ymm3
; with no separate shuffle instruction.
define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; CHECK-LABEL: test_unpackh_fhsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubps %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  %d23 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a2, <8 x float> %a3)
  %res = shufflevector <8 x float> %d01, <8 x float> %d23, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x float> %res
}
216
; 256-bit low unpack of two PHADDW results: the mask keeps elements 0-3 and
; 8-11 of each call. The assertions expect a single vphaddw of ymm0 and ymm2
; with no separate shuffle instruction.
define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackl_hadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  %h23 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
  %res = shufflevector <16 x i16> %h01, <16 x i16> %h23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %res
}
227
; 256-bit high unpack of two PHADDD results: the mask keeps elements 2-3 and
; 6-7 of each call. The assertions expect a single vphaddd of ymm1 and ymm3
; with no separate shuffle instruction.
define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackh_hadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddd %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %h01 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  %h23 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a2, <8 x i32> %a3)
  %res = shufflevector <8 x i32> %h01, <8 x i32> %h23, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x i32> %res
}
238
; 256-bit low unpack of two PHSUBD results: the mask keeps elements 0-1 and
; 4-5 of each call. The assertions expect a single vphsubd of ymm0 and ymm2
; with no separate shuffle instruction.
define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackl_hsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  %d23 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a2, <8 x i32> %a3)
  %res = shufflevector <8 x i32> %d01, <8 x i32> %d23, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i32> %res
}
249
; 256-bit high unpack of two PHSUBW results: the mask keeps elements 4-7 and
; 12-15 of each call. The assertions expect a single vphsubw of ymm1 and ymm3
; with no separate shuffle instruction.
define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackh_hsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubw %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %d01 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  %d23 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a2, <16 x i16> %a3)
  %res = shufflevector <16 x i16> %d01, <16 x i16> %d23, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %res
}
260
; 256-bit low unpack of two PACKSSWB results: the mask keeps elements 0-7 and
; 16-23 of each call. The assertions expect a single vpacksswb of ymm0 and
; ymm2 with no separate shuffle instruction.
define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackl_packss_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  %p23 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
  %res = shufflevector <32 x i8> %p01, <32 x i8> %p23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  ret <32 x i8> %res
}
271
; 256-bit high unpack of two PACKSSDW results: the mask keeps elements 4-7 and
; 12-15 of each call. The assertions expect a single vpackssdw of ymm1 and
; ymm3 with no separate shuffle instruction.
define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackh_packss_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %p23 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %res = shufflevector <16 x i16> %p01, <16 x i16> %p23, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %res
}
282
; 256-bit low unpack of two PACKUSDW results: the mask keeps elements 0-3 and
; 8-11 of each call. The assertions expect a single vpackusdw of ymm0 and ymm2
; with no separate shuffle instruction.
define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackl_packus_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  %p23 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
  %res = shufflevector <16 x i16> %p01, <16 x i16> %p23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %res
}
293
; 256-bit high unpack of two PACKUSWB results: the mask keeps elements 8-15
; and 24-31 of each call. The expected output is a single vpackuswb of ymm1
; and ymm3 with no separate shuffle instruction.
;
; This test must call the unsigned-saturating pack (@llvm.x86.avx2.packuswb),
; as its name says; calling packsswb here would only repeat the signed-pack
; case with a high-half mask and leave 256-bit PACKUSWB unpack-high untested.
; NOTE(review): the CHECK lines below were written by analogy with
; test_unpackh_packus_128; regenerate with utils/update_llc_test_checks.py to
; confirm.
define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackh_packus_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <32 x i8> %3
}
304
; 256-bit two-input float shuffle over bitcast PACKSSDW results: elements 0-1
; and 4-5 of the first call, plus elements 2 and 6 of the second call (each
; used twice). The assertions expect one vpackssdw of ymm0 and ymm3 followed by
; a vpshufd with pattern [0,1,2,2,4,5,6,6].
define <8 x float> @test_shufps_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_shufps_packss_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %p23 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %f01 = bitcast <16 x i16> %p01 to <8 x float>
  %f23 = bitcast <16 x i16> %p23 to <8 x float>
  %res = shufflevector <8 x float> %f01, <8 x float> %f23, <8 x i32> <i32 0, i32 1, i32 10, i32 10, i32 4, i32 5, i32 14, i32 14>
  ret <8 x float> %res
}
318
; 256-bit two-input float shuffle over bitcast PACKUSWB results: elements 1,0
; and 5,4 of the first call, plus elements 0 and 4 of the second call (each
; used twice). The assertions expect one vpackuswb of ymm0 and ymm2 followed by
; a vpshufd with pattern [1,0,2,2,5,4,6,6].
define <8 x float> @test_shufps_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_shufps_packus_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %p01 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  %p23 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
  %f01 = bitcast <32 x i8> %p01 to <8 x float>
  %f23 = bitcast <32 x i8> %p23 to <8 x float>
  %res = shufflevector <8 x float> %f01, <8 x float> %f23, <8 x i32> <i32 1, i32 0, i32 8, i32 8, i32 5, i32 4, i32 12, i32 12>
  ret <8 x float> %res
}
332
; Intrinsic declarations used by the tests in this file.

; 128-bit floating-point horizontal add/sub.
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

; 128-bit integer horizontal add/sub.
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

; 128-bit saturating packs.
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)

; 256-bit floating-point horizontal add/sub.
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)

; 256-bit integer horizontal add/sub.
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)

; 256-bit saturating packs.
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
362