xref: /llvm-project/llvm/test/CodeGen/X86/vector-pack-128.ll (revision e6bf48d11047e970cb24554a01b65b566d6b5d22)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2   | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx    | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2   | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
8
9; trunc(concat(x,y)) -> pack
10
; Signed dword -> word case of the trunc(concat(x,y)) pattern.
; %a0 is arithmetically shifted right by 17, leaving every lane in
; [-16384, 16383]; %a1 is masked with 15, leaving every lane in [0, 15].
; Both halves therefore already fit in a signed i16, so truncating the
; concatenated <8 x i32> yields the same lanes as a signed saturating pack.
; Every configuration below expects the shift, the mask and one (v)packssdw.
11define <8 x i16> @trunc_concat_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
12; SSE-LABEL: trunc_concat_packssdw_128:
13; SSE:       # %bb.0:
14; SSE-NEXT:    psrad $17, %xmm0
15; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
16; SSE-NEXT:    packssdw %xmm1, %xmm0
17; SSE-NEXT:    retq
18;
19; AVX1-LABEL: trunc_concat_packssdw_128:
20; AVX1:       # %bb.0:
21; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
22; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
23; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
24; AVX1-NEXT:    retq
25;
26; AVX2-LABEL: trunc_concat_packssdw_128:
27; AVX2:       # %bb.0:
28; AVX2-NEXT:    vpsrad $17, %xmm0, %xmm0
29; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
30; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
31; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
32; AVX2-NEXT:    retq
33;
34; AVX512-LABEL: trunc_concat_packssdw_128:
35; AVX512:       # %bb.0:
36; AVX512-NEXT:    vpsrad $17, %xmm0, %xmm0
37; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
38; AVX512-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
39; AVX512-NEXT:    retq
40  %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
41  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
42  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
43  %4 = trunc <8 x i32> %3 to <8 x i16>
44  ret <8 x i16> %4
45}
46
; Unsigned dword -> word case of the trunc(concat(x,y)) pattern.
; %a0 is logically shifted right by 17, leaving every lane in [0, 32767];
; %a1 is masked with 15, leaving every lane in [0, 15]. Both halves are
; non-negative and fit in 15 bits, so truncating the concatenated <8 x i32>
; matches either a signed or an unsigned saturating pack.
; The +sse2 configuration expects packssdw (packusdw is an SSE4.1
; instruction); the +sse4.2 and AVX configurations expect (v)packusdw.
47define <8 x i16> @trunc_concat_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
48; SSE2-LABEL: trunc_concat_packusdw_128:
49; SSE2:       # %bb.0:
50; SSE2-NEXT:    psrld $17, %xmm0
51; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
52; SSE2-NEXT:    packssdw %xmm1, %xmm0
53; SSE2-NEXT:    retq
54;
55; SSE4-LABEL: trunc_concat_packusdw_128:
56; SSE4:       # %bb.0:
57; SSE4-NEXT:    psrld $17, %xmm0
58; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
59; SSE4-NEXT:    packusdw %xmm1, %xmm0
60; SSE4-NEXT:    retq
61;
62; AVX1-LABEL: trunc_concat_packusdw_128:
63; AVX1:       # %bb.0:
64; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
65; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
66; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
67; AVX1-NEXT:    retq
68;
69; AVX2-LABEL: trunc_concat_packusdw_128:
70; AVX2:       # %bb.0:
71; AVX2-NEXT:    vpsrld $17, %xmm0, %xmm0
72; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
73; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
74; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
75; AVX2-NEXT:    retq
76;
77; AVX512-LABEL: trunc_concat_packusdw_128:
78; AVX512:       # %bb.0:
79; AVX512-NEXT:    vpsrld $17, %xmm0, %xmm0
80; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
81; AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
82; AVX512-NEXT:    retq
83  %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
84  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
85  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
86  %4 = trunc <8 x i32> %3 to <8 x i16>
87  ret <8 x i16> %4
88}
89
; Signed word -> byte case of the trunc(concat(x,y)) pattern.
; %a0 is arithmetically shifted right by 15, so each lane is 0 or -1;
; %a1 is masked with 1, so each lane is 0 or 1. Both halves fit in a signed
; i8, so truncating the concatenated <16 x i16> matches a signed saturating
; pack. Every configuration below expects the shift, the mask and one
; (v)packsswb.
90define <16 x i8> @trunc_concat_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
91; SSE-LABEL: trunc_concat_packsswb_128:
92; SSE:       # %bb.0:
93; SSE-NEXT:    psraw $15, %xmm0
94; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
95; SSE-NEXT:    packsswb %xmm1, %xmm0
96; SSE-NEXT:    retq
97;
98; AVX1-LABEL: trunc_concat_packsswb_128:
99; AVX1:       # %bb.0:
100; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
101; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
102; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
103; AVX1-NEXT:    retq
104;
105; AVX2-LABEL: trunc_concat_packsswb_128:
106; AVX2:       # %bb.0:
107; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
108; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
109; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
110; AVX2-NEXT:    retq
111;
112; AVX512-LABEL: trunc_concat_packsswb_128:
113; AVX512:       # %bb.0:
114; AVX512-NEXT:    vpsraw $15, %xmm0, %xmm0
115; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
116; AVX512-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
117; AVX512-NEXT:    retq
118  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
119  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
120  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
121  %4 = trunc <16 x i16> %3 to <16 x i8>
122  ret <16 x i8> %4
123}
124
; Unsigned word -> byte case of the trunc(concat(x,y)) pattern.
; %a0 is logically shifted right by 15 and %a1 is masked with 1, so every
; lane of both halves is 0 or 1. Truncating the concatenated <16 x i16>
; therefore matches an unsigned saturating pack. Every configuration below
; expects the shift, the mask and one (v)packuswb.
125define <16 x i8> @trunc_concat_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
126; SSE-LABEL: trunc_concat_packuswb_128:
127; SSE:       # %bb.0:
128; SSE-NEXT:    psrlw $15, %xmm0
129; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
130; SSE-NEXT:    packuswb %xmm1, %xmm0
131; SSE-NEXT:    retq
132;
133; AVX1-LABEL: trunc_concat_packuswb_128:
134; AVX1:       # %bb.0:
135; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
136; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
137; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
138; AVX1-NEXT:    retq
139;
140; AVX2-LABEL: trunc_concat_packuswb_128:
141; AVX2:       # %bb.0:
142; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
143; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
144; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
145; AVX2-NEXT:    retq
146;
147; AVX512-LABEL: trunc_concat_packuswb_128:
148; AVX512:       # %bb.0:
149; AVX512-NEXT:    vpsrlw $15, %xmm0, %xmm0
150; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
151; AVX512-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
152; AVX512-NEXT:    retq
153  %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
154  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
155  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
156  %4 = trunc <16 x i16> %3 to <16 x i8>
157  ret <16 x i8> %4
158}
159
160; concat(trunc(x),trunc(y)) -> pack
161
; Signed dword -> word case of the concat(trunc(x),trunc(y)) pattern.
; Same operands as the trunc(concat) signed dword test (ashr by 17, and
; with 15), but here each <4 x i32> half is truncated to <4 x i16> first and
; the two narrow halves are concatenated afterwards.
; The +sse4.2 and AVX configurations expect a single (v)packssdw. The +sse2
; expectations currently record a longer sequence instead, packing each half
; on its own (packssdw on xmm0, packuswb on xmm1) and joining the low
; quadwords with punpcklqdq.
162define <8 x i16> @concat_trunc_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
163; SSE2-LABEL: concat_trunc_packssdw_128:
164; SSE2:       # %bb.0:
165; SSE2-NEXT:    psrad $17, %xmm0
166; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
167; SSE2-NEXT:    packssdw %xmm0, %xmm0
168; SSE2-NEXT:    packuswb %xmm1, %xmm1
169; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
170; SSE2-NEXT:    retq
171;
172; SSE4-LABEL: concat_trunc_packssdw_128:
173; SSE4:       # %bb.0:
174; SSE4-NEXT:    psrad $17, %xmm0
175; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
176; SSE4-NEXT:    packssdw %xmm1, %xmm0
177; SSE4-NEXT:    retq
178;
179; AVX1-LABEL: concat_trunc_packssdw_128:
180; AVX1:       # %bb.0:
181; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
182; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
183; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
184; AVX1-NEXT:    retq
185;
186; AVX2-LABEL: concat_trunc_packssdw_128:
187; AVX2:       # %bb.0:
188; AVX2-NEXT:    vpsrad $17, %xmm0, %xmm0
189; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
190; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
191; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
192; AVX2-NEXT:    retq
193;
194; AVX512-LABEL: concat_trunc_packssdw_128:
195; AVX512:       # %bb.0:
196; AVX512-NEXT:    vpsrad $17, %xmm0, %xmm0
197; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
198; AVX512-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
199; AVX512-NEXT:    retq
200  %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
201  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
202  %3 = trunc <4 x i32> %1 to <4 x i16>
203  %4 = trunc <4 x i32> %2 to <4 x i16>
204  %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
205  ret <8 x i16> %5
206}
207
; Unsigned dword -> word case of the concat(trunc(x),trunc(y)) pattern.
; Same operands as the trunc(concat) unsigned dword test (lshr by 17, and
; with 15), but each <4 x i32> half is truncated to <4 x i16> before the
; halves are concatenated.
; The +sse4.2 and AVX configurations expect a single (v)packusdw. The +sse2
; expectations currently record a longer sequence instead, packing each half
; on its own (packssdw on xmm0, packuswb on xmm1) and joining the low
; quadwords with punpcklqdq.
208define <8 x i16> @concat_trunc_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
209; SSE2-LABEL: concat_trunc_packusdw_128:
210; SSE2:       # %bb.0:
211; SSE2-NEXT:    psrld $17, %xmm0
212; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
213; SSE2-NEXT:    packssdw %xmm0, %xmm0
214; SSE2-NEXT:    packuswb %xmm1, %xmm1
215; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
216; SSE2-NEXT:    retq
217;
218; SSE4-LABEL: concat_trunc_packusdw_128:
219; SSE4:       # %bb.0:
220; SSE4-NEXT:    psrld $17, %xmm0
221; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
222; SSE4-NEXT:    packusdw %xmm1, %xmm0
223; SSE4-NEXT:    retq
224;
225; AVX1-LABEL: concat_trunc_packusdw_128:
226; AVX1:       # %bb.0:
227; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
228; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
229; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
230; AVX1-NEXT:    retq
231;
232; AVX2-LABEL: concat_trunc_packusdw_128:
233; AVX2:       # %bb.0:
234; AVX2-NEXT:    vpsrld $17, %xmm0, %xmm0
235; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
236; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
237; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
238; AVX2-NEXT:    retq
239;
240; AVX512-LABEL: concat_trunc_packusdw_128:
241; AVX512:       # %bb.0:
242; AVX512-NEXT:    vpsrld $17, %xmm0, %xmm0
243; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
244; AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
245; AVX512-NEXT:    retq
246  %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
247  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
248  %3 = trunc <4 x i32> %1 to <4 x i16>
249  %4 = trunc <4 x i32> %2 to <4 x i16>
250  %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
251  ret <8 x i16> %5
252}
253
; Signed word -> byte case of the concat(trunc(x),trunc(y)) pattern.
; %a0 is arithmetically shifted right by 15 (each lane 0 or -1) and %a1 is
; masked with 1 (each lane 0 or 1); each <8 x i16> half is truncated to
; <8 x i8> and the two narrow halves are then concatenated.
; Every configuration below expects the shift, the mask and one (v)packsswb.
254define <16 x i8> @concat_trunc_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
255; SSE-LABEL: concat_trunc_packsswb_128:
256; SSE:       # %bb.0:
257; SSE-NEXT:    psraw $15, %xmm0
258; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
259; SSE-NEXT:    packsswb %xmm1, %xmm0
260; SSE-NEXT:    retq
261;
262; AVX1-LABEL: concat_trunc_packsswb_128:
263; AVX1:       # %bb.0:
264; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
265; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
266; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
267; AVX1-NEXT:    retq
268;
269; AVX2-LABEL: concat_trunc_packsswb_128:
270; AVX2:       # %bb.0:
271; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
272; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
273; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
274; AVX2-NEXT:    retq
275;
276; AVX512-LABEL: concat_trunc_packsswb_128:
277; AVX512:       # %bb.0:
278; AVX512-NEXT:    vpsraw $15, %xmm0, %xmm0
279; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
280; AVX512-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
281; AVX512-NEXT:    retq
282  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
283  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
284  %3 = trunc <8 x i16> %1 to <8 x i8>
285  %4 = trunc <8 x i16> %2 to <8 x i8>
286  %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
287  ret <16 x i8> %5
288}
289
; Unsigned word -> byte case of the concat(trunc(x),trunc(y)) pattern.
; %a0 is logically shifted right by 15 and %a1 is masked with 1, so every
; lane of both halves is 0 or 1; each <8 x i16> half is truncated to
; <8 x i8> and the two narrow halves are then concatenated.
; Every configuration below expects the shift, the mask and one (v)packuswb.
290define <16 x i8> @concat_trunc_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
291; SSE-LABEL: concat_trunc_packuswb_128:
292; SSE:       # %bb.0:
293; SSE-NEXT:    psrlw $15, %xmm0
294; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
295; SSE-NEXT:    packuswb %xmm1, %xmm0
296; SSE-NEXT:    retq
297;
298; AVX1-LABEL: concat_trunc_packuswb_128:
299; AVX1:       # %bb.0:
300; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
301; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
302; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
303; AVX1-NEXT:    retq
304;
305; AVX2-LABEL: concat_trunc_packuswb_128:
306; AVX2:       # %bb.0:
307; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
308; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
309; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
310; AVX2-NEXT:    retq
311;
312; AVX512-LABEL: concat_trunc_packuswb_128:
313; AVX512:       # %bb.0:
314; AVX512-NEXT:    vpsrlw $15, %xmm0, %xmm0
315; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
316; AVX512-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
317; AVX512-NEXT:    retq
318  %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
319  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
320  %3 = trunc <8 x i16> %1 to <8 x i8>
321  %4 = trunc <8 x i16> %2 to <8 x i8>
322  %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
323  ret <16 x i8> %5
324}
325
326; Fuzz test - don't pack a v1i32 comparison result.
; Reduced fuzzer case operating on single-element vectors.
; The compare is an unsigned x >= 0 against zeroinitializer, which holds for
; every input, so the zero-extended <1 x i1> is the constant 1. Block CF
; stores that <1 x i16> to a null pointer and branches back to itself.
; The shared expectations (used by every configuration) are a single
; `movw $1, 0` store inside an unconditional self-loop, with no pack
; instruction emitted.
327define void @autogen_SD10339(<1 x i32> %I49) {
328; CHECK-LABEL: autogen_SD10339:
329; CHECK:       # %bb.0: # %BB
330; CHECK-NEXT:    .p2align 4
331; CHECK-NEXT:  .LBB8_1: # %CF
332; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
333; CHECK-NEXT:    movw $1, 0
334; CHECK-NEXT:    jmp .LBB8_1
335BB:
336  %Cmp53 = icmp uge <1 x i32> %I49, zeroinitializer
337  br label %CF
338
339CF:                                               ; preds = %CF, %BB
340  %ZE166 = zext <1 x i1> %Cmp53 to <1 x i16>
341  store <1 x i16> %ZE166, ptr null, align 2
342  br label %CF
343}
344
345;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
346; AVX: {{.*}}
347