xref: /llvm-project/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll (revision cf9b1f7a0e9da5d019a8bea853f3cff85d808d18)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
9
; Returns elements 1 and 3 of %x. Both 128-bit halves of the <4 x i64> input
; are extracted (%extrl = elements 0-1, %extrh = elements 2-3) and the final
; shuffle mask <1, 3> picks the upper element of each half.
; Expected codegen per the assertions below: the AVX1 run uses vextractf128
; followed by vunpckhpd, while the AVX2OR512VL runs use a single vpermpd.
10define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) {
11; AVX1-LABEL: unpckh_unary_extracted_v4i64:
12; AVX1:       # %bb.0:
13; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
14; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
15; AVX1-NEXT:    vzeroupper
16; AVX1-NEXT:    retq
17;
18; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64:
19; AVX2OR512VL:       # %bb.0:
20; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
21; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
22; AVX2OR512VL-NEXT:    vzeroupper
23; AVX2OR512VL-NEXT:    retq
24  %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
25  %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
26  %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 1, i32 3>
27  ret <2 x i64> %r
28}
29
; Floating-point variant of the unpack-high test: returns elements 1 and 3 of
; the <4 x double> input by extracting both 128-bit halves and shuffling them
; with mask <1, 3>.
; Expected codegen per the assertions below: the AVX1 run uses vextractf128
; followed by vunpckhpd, while the AVX2OR512VL runs use a single vpermpd.
; NOTE(review): the function name says v8f64 but the operand type is
; <4 x double>; the name is left as-is because the -LABEL checks match on it.
30define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
31; AVX1-LABEL: unpckh_unary_extracted_v8f64:
32; AVX1:       # %bb.0:
33; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
34; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
35; AVX1-NEXT:    vzeroupper
36; AVX1-NEXT:    retq
37;
38; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64:
39; AVX2OR512VL:       # %bb.0:
40; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
41; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
42; AVX2OR512VL-NEXT:    vzeroupper
43; AVX2OR512VL-NEXT:    retq
44  %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
45  %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
46  %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 1, i32 3>
47  ret <2 x double> %r
48}
49
50; vpermps requires a constant load for the index op. It's unlikely to be profitable.
51
; Interleaves the upper two elements of each 128-bit half of the <8 x i32>
; input: the final mask <2, 6, 3, 7> yields x[2], x[6], x[3], x[7].
; Every run line shares the same expectation (prefix ALL), namely
; vextractf128 followed by vunpckhps on the two xmm halves.
52define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
53; ALL-LABEL: unpckh_unary_extracted_v8i32:
54; ALL:       # %bb.0:
55; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
56; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
57; ALL-NEXT:    vzeroupper
58; ALL-NEXT:    retq
59  %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
60  %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
61  %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
62  ret <4 x i32> %r
63}
64
; Floating-point variant of the 32-bit unpack-high test: the final mask
; <2, 6, 3, 7> interleaves the upper two elements of each 128-bit half of the
; <8 x float> input, yielding x[2], x[6], x[3], x[7].
; Every run line shares the same expectation (prefix ALL), namely
; vextractf128 followed by vunpckhps on the two xmm halves.
65define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
66; ALL-LABEL: unpckh_unary_extracted_v8f32:
67; ALL:       # %bb.0:
68; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
69; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
70; ALL-NEXT:    vzeroupper
71; ALL-NEXT:    retq
72  %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
73  %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
74  %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
75  ret <4 x float> %r
76}
77
; Interleaves the upper four i16 elements of each 128-bit half of the
; <16 x i16> input: the final mask <4, 12, 5, 13, 6, 14, 7, 15> yields
; x[4], x[12], x[5], x[13], x[6], x[14], x[7], x[15].
; Both check groups expect a 128-bit extract followed by vpunpckhwd; they
; differ only in the extract opcode (vextractf128 for the AVX1 run,
; vextracti128 for the AVX2OR512VL runs).
78define <8 x i16> @unpckh_unary_extracted_v16i16(<16 x i16> %x) {
79; AVX1-LABEL: unpckh_unary_extracted_v16i16:
80; AVX1:       # %bb.0:
81; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
82; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
83; AVX1-NEXT:    vzeroupper
84; AVX1-NEXT:    retq
85;
86; AVX2OR512VL-LABEL: unpckh_unary_extracted_v16i16:
87; AVX2OR512VL:       # %bb.0:
88; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
89; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
90; AVX2OR512VL-NEXT:    vzeroupper
91; AVX2OR512VL-NEXT:    retq
92  %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
93  %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
94  %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
95  ret <8 x i16> %r
96}
97
; Interleaves the upper eight bytes of each 128-bit half of the <32 x i8>
; input: the final mask <8, 24, 9, 25, ..., 15, 31> alternates x[8..15] with
; x[24..31].
; Both check groups expect a 128-bit extract followed by vpunpckhbw; they
; differ only in the extract opcode (vextractf128 for the AVX1 run,
; vextracti128 for the AVX2OR512VL runs).
98define <16 x i8> @unpckh_unary_extracted_v32i8(<32 x i8> %x) {
99; AVX1-LABEL: unpckh_unary_extracted_v32i8:
100; AVX1:       # %bb.0:
101; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
102; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
103; AVX1-NEXT:    vzeroupper
104; AVX1-NEXT:    retq
105;
106; AVX2OR512VL-LABEL: unpckh_unary_extracted_v32i8:
107; AVX2OR512VL:       # %bb.0:
108; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
109; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
110; AVX2OR512VL-NEXT:    vzeroupper
111; AVX2OR512VL-NEXT:    retq
112  %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
113  %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
114  %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
115  ret <16 x i8> %r
116}
117
; Returns elements 0 and 2 of %x. Both 128-bit halves of the <4 x i64> input
; are extracted (%extrl = elements 0-1, %extrh = elements 2-3) and the final
; shuffle mask <0, 2> picks the lower element of each half.
; Expected codegen per the assertions below: the AVX1 run uses vextractf128
; followed by vmovlhps, while the AVX2OR512VL runs use a single vpermpd.
118define <2 x i64> @unpckl_unary_extracted_v4i64(<4 x i64> %x) {
119; AVX1-LABEL: unpckl_unary_extracted_v4i64:
120; AVX1:       # %bb.0:
121; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
122; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
123; AVX1-NEXT:    vzeroupper
124; AVX1-NEXT:    retq
125;
126; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64:
127; AVX2OR512VL:       # %bb.0:
128; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
129; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
130; AVX2OR512VL-NEXT:    vzeroupper
131; AVX2OR512VL-NEXT:    retq
132  %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
133  %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
134  %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 0, i32 2>
135  ret <2 x i64> %r
136}
137
; Floating-point variant of the unpack-low test: returns elements 0 and 2 of
; the <4 x double> input by extracting both 128-bit halves and shuffling them
; with mask <0, 2>.
; Expected codegen per the assertions below: the AVX1 run uses vextractf128
; followed by vmovlhps, while the AVX2OR512VL runs use a single vpermpd.
; NOTE(review): the function name says v8f64 but the operand type is
; <4 x double>; the name is left as-is because the -LABEL checks match on it.
138define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
139; AVX1-LABEL: unpckl_unary_extracted_v8f64:
140; AVX1:       # %bb.0:
141; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
142; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
143; AVX1-NEXT:    vzeroupper
144; AVX1-NEXT:    retq
145;
146; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64:
147; AVX2OR512VL:       # %bb.0:
148; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
149; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
150; AVX2OR512VL-NEXT:    vzeroupper
151; AVX2OR512VL-NEXT:    retq
152  %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
153  %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
154  %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 0, i32 2>
155  ret <2 x double> %r
156}
157
158; vpermps requires a constant load for the index op. It's unlikely to be profitable.
159
; Interleaves the lower two elements of each 128-bit half of the <8 x i32>
; input: the final mask <0, 4, 1, 5> yields x[0], x[4], x[1], x[5].
; Every run line shares the same expectation (prefix ALL), namely
; vextractf128 followed by vunpcklps on the two xmm halves.
160define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
161; ALL-LABEL: unpckl_unary_extracted_v8i32:
162; ALL:       # %bb.0:
163; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
164; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
165; ALL-NEXT:    vzeroupper
166; ALL-NEXT:    retq
167  %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
168  %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
169  %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
170  ret <4 x i32> %r
171}
172
; Floating-point variant of the 32-bit unpack-low test: the final mask
; <0, 4, 1, 5> interleaves the lower two elements of each 128-bit half of the
; <8 x float> input, yielding x[0], x[4], x[1], x[5].
; Every run line shares the same expectation (prefix ALL), namely
; vextractf128 followed by vunpcklps on the two xmm halves.
173define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
174; ALL-LABEL: unpckl_unary_extracted_v8f32:
175; ALL:       # %bb.0:
176; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
177; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
178; ALL-NEXT:    vzeroupper
179; ALL-NEXT:    retq
180  %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
181  %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
182  %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
183  ret <4 x float> %r
184}
185
; Interleaves the lower four i16 elements of each 128-bit half of the
; <16 x i16> input: the final mask <0, 8, 1, 9, 2, 10, 3, 11> yields
; x[0], x[8], x[1], x[9], x[2], x[10], x[3], x[11].
; Both check groups expect a 128-bit extract followed by vpunpcklwd; they
; differ only in the extract opcode (vextractf128 for the AVX1 run,
; vextracti128 for the AVX2OR512VL runs).
186define <8 x i16> @unpckl_unary_extracted_v16i16(<16 x i16> %x) {
187; AVX1-LABEL: unpckl_unary_extracted_v16i16:
188; AVX1:       # %bb.0:
189; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
190; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
191; AVX1-NEXT:    vzeroupper
192; AVX1-NEXT:    retq
193;
194; AVX2OR512VL-LABEL: unpckl_unary_extracted_v16i16:
195; AVX2OR512VL:       # %bb.0:
196; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
197; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
198; AVX2OR512VL-NEXT:    vzeroupper
199; AVX2OR512VL-NEXT:    retq
200  %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
201  %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
202  %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
203  ret <8 x i16> %r
204}
205
; Interleaves the lower eight bytes of each 128-bit half of the <32 x i8>
; input: the final mask <0, 16, 1, 17, ..., 7, 23> alternates x[0..7] with
; x[16..23].
; Both check groups expect a 128-bit extract followed by vpunpcklbw; they
; differ only in the extract opcode (vextractf128 for the AVX1 run,
; vextracti128 for the AVX2OR512VL runs).
206define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
207; AVX1-LABEL: unpckl_unary_extracted_v32i8:
208; AVX1:       # %bb.0:
209; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
210; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
211; AVX1-NEXT:    vzeroupper
212; AVX1-NEXT:    retq
213;
214; AVX2OR512VL-LABEL: unpckl_unary_extracted_v32i8:
215; AVX2OR512VL:       # %bb.0:
216; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
217; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
218; AVX2OR512VL-NEXT:    vzeroupper
219; AVX2OR512VL-NEXT:    retq
220  %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
221  %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
222  %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
223  ret <16 x i8> %r
224}
225
226; This used to cause an infinite loop because the unpack shuffle mask was not recognized in commuted form.
227
; Single-input 256-bit shuffle of %a (second operand undef). Only result
; lanes 0, 2 and 3 are defined by the mask; they take elements 4, 5 and 1 of
; %a, and every other lane is undef.
; Every run line shares the same expectation (prefix ALL), namely
; vextractf128 followed by vunpcklps with the extracted upper half as the
; first source (xmm1[0],xmm0[0],xmm1[1],xmm0[1]), i.e. the operands appear in
; the opposite order from the unpckl tests above. No vzeroupper is expected
; before retq in this function.
228define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
229; ALL-LABEL: extract_unpckl_v8i32:
230; ALL:       # %bb.0:
231; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
232; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
233; ALL-NEXT:    retq
234  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
235  ret <8 x i32> %shuffle
236}
237
238