xref: /llvm-project/llvm/test/CodeGen/X86/vector-pack-512.ll (revision ea9df0982fa8f8049b52bf5b449eed08d4f551e4)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f  | FileCheck %s --check-prefixes=AVX512,AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
4
5; trunc(concat(x,y)) -> pack
6
; Signed i32 -> i16 case, truncating after the interleave.
; Both <16 x i32> inputs are arithmetically shifted right (by 17 and by 23),
; so every element is already representable as a signed i16. The shufflevector
; interleaves the two shifted vectors in groups of four i32 elements
; (a0[0..3], a1[0..3], a0[4..7], a1[4..7], ...) into a <32 x i32>, which is
; then truncated to <32 x i16>.
; The expected output below currently uses two vpermi2q shuffles followed by
; vpmovdw truncations rather than a single pack instruction.
7define <32 x i16> @trunc_concat_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
8; AVX512-LABEL: trunc_concat_packssdw_512:
9; AVX512:       # %bb.0:
10; AVX512-NEXT:    vpsrad $17, %zmm0, %zmm0
11; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
12; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
13; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
14; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
15; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
16; AVX512-NEXT:    vpmovdw %zmm3, %ymm0
17; AVX512-NEXT:    vpmovdw %zmm2, %ymm1
18; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
19; AVX512-NEXT:    retq
20  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
21  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
22  %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
23  %4 = trunc <32 x i32> %3 to <32 x i16>
24  ret <32 x i16> %4
25}
26
; Unsigned i32 -> i16 case, truncating after the interleave.
; %a0 is logically shifted right by 17 and %a1 is masked with 15, so every
; element has its upper 16 bits clear and fits in an unsigned i16. The
; shufflevector interleaves the two vectors in groups of four i32 elements
; into a <32 x i32>, which is then truncated to <32 x i16>.
; The mask on %a1 is expected to be emitted as a broadcast-from-memory vpandd;
; the shuffle + truncate sequence is the same as in the signed variant.
27define <32 x i16> @trunc_concat_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
28; AVX512-LABEL: trunc_concat_packusdw_512:
29; AVX512:       # %bb.0:
30; AVX512-NEXT:    vpsrld $17, %zmm0, %zmm0
31; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
32; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
33; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
34; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
35; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
36; AVX512-NEXT:    vpmovdw %zmm3, %ymm0
37; AVX512-NEXT:    vpmovdw %zmm2, %ymm1
38; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
39; AVX512-NEXT:    retq
40  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
41  %2 = and  <16 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
42  %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
43  %4 = trunc <32 x i32> %3 to <32 x i16>
44  ret <32 x i16> %4
45}
46
; Signed i16 -> i8 case, truncating after the interleave.
; %a0 is arithmetically shifted right by 15 (each element becomes 0 or -1) and
; %a1 is masked with 1 (each element becomes 0 or 1), so every element fits in
; an i8. The shufflevector interleaves the two vectors in groups of eight i16
; elements into a <64 x i16>, which is then truncated to <64 x i8>.
; The two RUN configurations are checked separately:
;  - the +avx512f run shifts the two 256-bit halves separately and truncates
;    each 256-bit piece through vpmovzxwd + vpmovdb;
;  - the +avx512bw run shifts the full zmm and truncates with vpmovwb.
47define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
48; AVX512F-LABEL: trunc_concat_packsswb_512:
49; AVX512F:       # %bb.0:
50; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
51; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
52; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
53; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
54; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
55; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
56; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
57; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
58; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
59; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
60; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
61; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
62; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
63; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
64; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
65; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
66; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
67; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
68; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
69; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
70; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
71; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
72; AVX512F-NEXT:    retq
73;
74; AVX512BW-LABEL: trunc_concat_packsswb_512:
75; AVX512BW:       # %bb.0:
76; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm0
77; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
78; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
79; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
80; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
81; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
82; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm0
83; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm1
84; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
85; AVX512BW-NEXT:    retq
86  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
87  %2 = and  <32 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
88  %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
89  %4 = trunc <64 x i16> %3 to <64 x i8>
90  ret <64 x i8> %4
91}
92
; Unsigned i16 -> i8 case, truncating after the interleave.
; %a0 is logically shifted right by 15 and %a1 is masked with 1, so every
; element is 0 or 1 and fits in an unsigned i8. The shufflevector interleaves
; the two vectors in groups of eight i16 elements into a <64 x i16>, which is
; then truncated to <64 x i8>.
; The two RUN configurations are checked separately:
;  - the +avx512f run shifts the two 256-bit halves separately and truncates
;    each 256-bit piece through vpmovzxwd + vpmovdb;
;  - the +avx512bw run shifts the full zmm and truncates with vpmovwb.
93define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
94; AVX512F-LABEL: trunc_concat_packuswb_512:
95; AVX512F:       # %bb.0:
96; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm2
97; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
98; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
99; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
100; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
101; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
102; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
103; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
104; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
105; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
106; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
107; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
108; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
109; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
110; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
111; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
112; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
113; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
114; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
115; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
116; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
117; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
118; AVX512F-NEXT:    retq
119;
120; AVX512BW-LABEL: trunc_concat_packuswb_512:
121; AVX512BW:       # %bb.0:
122; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm0
123; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
124; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
125; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
126; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
127; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
128; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm0
129; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm1
130; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
131; AVX512BW-NEXT:    retq
132  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
133  %2 = and  <32 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
134  %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
135  %4 = trunc <64 x i16> %3 to <64 x i8>
136  ret <64 x i8> %4
137}
138
139; concat(trunc(x),trunc(y)) -> pack
140
; Signed i32 -> i16 case, truncating before the interleave.
; Both <16 x i32> inputs are arithmetically shifted right (by 17 and by 23),
; so every element is representable as a signed i16. Each shifted vector is
; truncated to <16 x i16> on its own, and the two results are then interleaved
; in groups of four i16 elements (i.e. 64 bits at a time) into a <32 x i16>.
; The expected output truncates each input with vpmovdw and merges the two
; 256-bit results with a single quadword vpermi2q.
141define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
142; AVX512-LABEL: concat_trunc_packssdw_512:
143; AVX512:       # %bb.0:
144; AVX512-NEXT:    vpsrad $17, %zmm0, %zmm0
145; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
146; AVX512-NEXT:    vpmovdw %zmm0, %ymm2
147; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
148; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11]
149; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
150; AVX512-NEXT:    retq
151  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
152  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
153  %3 = trunc <16 x i32> %1 to <16 x i16>
154  %4 = trunc <16 x i32> %2 to <16 x i16>
155  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
156  ret <32 x i16> %5
157}
158
; Unsigned i32 -> i16 case, truncating before the interleave.
; Both <16 x i32> inputs are logically shifted right (by 17 and by 23), so
; every element has its upper 16 bits clear and fits in an unsigned i16. Each
; shifted vector is truncated to <16 x i16> on its own, and the two results
; are then interleaved in groups of four i16 elements into a <32 x i16>.
; The expected output truncates each input with vpmovdw and merges the two
; 256-bit results with a single quadword vpermi2q.
159define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
160; AVX512-LABEL: concat_trunc_packusdw_512:
161; AVX512:       # %bb.0:
162; AVX512-NEXT:    vpsrld $17, %zmm0, %zmm0
163; AVX512-NEXT:    vpsrld $23, %zmm1, %zmm1
164; AVX512-NEXT:    vpmovdw %zmm0, %ymm2
165; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
166; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11]
167; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
168; AVX512-NEXT:    retq
169  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
170  %2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
171  %3 = trunc <16 x i32> %1 to <16 x i16>
172  %4 = trunc <16 x i32> %2 to <16 x i16>
173  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
174  ret <32 x i16> %5
175}
176
; Signed i16 -> i8 case, truncating before the interleave.
; %a0 is arithmetically shifted right by 15 (each element becomes 0 or -1) and
; %a1 is masked with 1 (each element becomes 0 or 1). Each vector is truncated
; to <32 x i8> on its own, and the two results are then interleaved in groups
; of eight i8 elements (64 bits at a time) into a <64 x i8>.
; The two RUN configurations are checked separately:
;  - the +avx512f run narrows the shifted %a0 halves with a 256-bit vpacksswb
;    and narrows %a1 through vpmovzxwd + vpmovdb; the final vpermi2q index
;    table differs from the other run ([0,8,2,9,1,10,3,11]) because the
;    per-lane pack leaves the quadwords of the first operand in a different
;    order;
;  - the +avx512bw run narrows both operands with vpmovwb.
; In both runs the mask on %a1 is applied after the truncation (ymm vpandd).
177define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
178; AVX512F-LABEL: concat_trunc_packsswb_512:
179; AVX512F:       # %bb.0:
180; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
181; AVX512F-NEXT:    vpsraw $15, %ymm2, %ymm2
182; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
183; AVX512F-NEXT:    vpacksswb %ymm2, %ymm0, %ymm2
184; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
185; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
186; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
187; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
188; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
189; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
190; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
191; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11]
192; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
193; AVX512F-NEXT:    retq
194;
195; AVX512BW-LABEL: concat_trunc_packsswb_512:
196; AVX512BW:       # %bb.0:
197; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm0
198; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm2
199; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm0
200; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
201; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11]
202; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
203; AVX512BW-NEXT:    retq
204  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
205  %2 = and  <32 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
206  %3 = trunc <32 x i16> %1 to <32 x i8>
207  %4 = trunc <32 x i16> %2 to <32 x i8>
208  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
209  ret <64 x i8> %5
210}
211
; Unsigned i16 -> i8 case, truncating before the interleave.
; %a0 is logically shifted right by 15 and %a1 is masked with 1, so every
; element is 0 or 1. Each vector is truncated to <32 x i8> on its own, and the
; two results are then interleaved in groups of eight i8 elements (64 bits at
; a time) into a <64 x i8>.
; The two RUN configurations are checked separately:
;  - the +avx512f run narrows the shifted %a0 halves with a 256-bit vpackuswb
;    and narrows %a1 through vpmovzxwd + vpmovdb; the final vpermi2q index
;    table differs from the other run ([0,8,2,9,1,10,3,11]) because the
;    per-lane pack leaves the quadwords of the first operand in a different
;    order;
;  - the +avx512bw run narrows both operands with vpmovwb.
; In both runs the mask on %a1 is applied after the truncation (ymm vpandd).
212define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
213; AVX512F-LABEL: concat_trunc_packuswb_512:
214; AVX512F:       # %bb.0:
215; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
216; AVX512F-NEXT:    vpsrlw $15, %ymm2, %ymm2
217; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
218; AVX512F-NEXT:    vpackuswb %ymm2, %ymm0, %ymm2
219; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
220; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
221; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
222; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
223; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
224; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
225; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
226; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11]
227; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
228; AVX512F-NEXT:    retq
229;
230; AVX512BW-LABEL: concat_trunc_packuswb_512:
231; AVX512BW:       # %bb.0:
232; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm0
233; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm2
234; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm0
235; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
236; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11]
237; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
238; AVX512BW-NEXT:    retq
239  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
240  %2 = and  <32 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
241  %3 = trunc <32 x i16> %1 to <32 x i8>
242  %4 = trunc <32 x i16> %2 to <32 x i8>
243  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
244  ret <64 x i8> %5
245}
246
; Concatenation of two 256-bit signed pack intrinsic calls.
; %lo = packssdw(%a0, %a1) and %hi = packssdw(%a2, %a3) are joined by an
; identity-order shufflevector (indices 0..31), i.e. a plain concatenation
; into a <32 x i16>.
; The two RUN configurations are checked separately:
;  - the +avx512f run keeps two ymm vpackssdw instructions and joins them
;    with vinserti64x4;
;  - the +avx512bw run first builds zmm operands (%a0 with %a2 in the upper
;    half, %a1 with %a3 in the upper half) and issues one zmm vpackssdw.
; Unlike the tests above, this function is not marked nounwind.
247define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
248; AVX512F-LABEL: concat_packsswd_int_2x256:
249; AVX512F:       # %bb.0:
250; AVX512F-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
251; AVX512F-NEXT:    vpackssdw %ymm3, %ymm2, %ymm1
252; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
253; AVX512F-NEXT:    retq
254;
255; AVX512BW-LABEL: concat_packsswd_int_2x256:
256; AVX512BW:       # %bb.0:
257; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
258; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
259; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
260; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
261; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
262; AVX512BW-NEXT:    retq
263  %lo = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
264  %hi = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
265  %res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
266  ret <32 x i16> %res
267}
; 256-bit signed dword -> word pack intrinsic used by the test above.
268declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
269
; Concatenation of two 256-bit unsigned pack intrinsic calls.
; %lo = packusdw(%a0, %a1) and %hi = packusdw(%a2, %a3) are joined by an
; identity-order shufflevector (indices 0..31), i.e. a plain concatenation
; into a <32 x i16>.
; The two RUN configurations are checked separately:
;  - the +avx512f run keeps two ymm vpackusdw instructions and joins them
;    with vinserti64x4;
;  - the +avx512bw run first builds zmm operands (%a0 with %a2 in the upper
;    half, %a1 with %a3 in the upper half) and issues one zmm vpackusdw.
; Unlike the trunc/concat tests earlier in the file, this function is not
; marked nounwind.
270define <32 x i16> @concat_packuswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
271; AVX512F-LABEL: concat_packuswd_int_2x256:
272; AVX512F:       # %bb.0:
273; AVX512F-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
274; AVX512F-NEXT:    vpackusdw %ymm3, %ymm2, %ymm1
275; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
276; AVX512F-NEXT:    retq
277;
278; AVX512BW-LABEL: concat_packuswd_int_2x256:
279; AVX512BW:       # %bb.0:
280; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
281; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
282; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
283; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
284; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
285; AVX512BW-NEXT:    retq
286  %lo = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
287  %hi = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
288  %res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
289  ret <32 x i16> %res
290}
; 256-bit unsigned dword -> word pack intrinsic used by the test above.
291declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
292