1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
15; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
16
17define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
18; SSE-LABEL: trunc8i64_8i32:
19; SSE:       # %bb.0: # %entry
20; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
21; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
22; SSE-NEXT:    movaps %xmm2, %xmm1
23; SSE-NEXT:    retq
24;
25; AVX1-LABEL: trunc8i64_8i32:
26; AVX1:       # %bb.0: # %entry
27; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
28; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
29; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
30; AVX1-NEXT:    retq
31;
32; AVX2-SLOW-LABEL: trunc8i64_8i32:
33; AVX2-SLOW:       # %bb.0: # %entry
34; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
35; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
36; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
37; AVX2-SLOW-NEXT:    retq
38;
39; AVX2-FAST-ALL-LABEL: trunc8i64_8i32:
40; AVX2-FAST-ALL:       # %bb.0: # %entry
41; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
42; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
43; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
44; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
45; AVX2-FAST-ALL-NEXT:    retq
46;
47; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32:
48; AVX2-FAST-PERLANE:       # %bb.0: # %entry
49; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
50; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
51; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
52; AVX2-FAST-PERLANE-NEXT:    retq
53;
54; AVX512-LABEL: trunc8i64_8i32:
55; AVX512:       # %bb.0: # %entry
56; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
57; AVX512-NEXT:    retq
58entry:
59  %0 = trunc <8 x i64> %a to <8 x i32>
60  ret <8 x i32> %0
61}
62
63define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
64; SSE-LABEL: trunc8i64_8i32_ashr:
65; SSE:       # %bb.0: # %entry
66; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
67; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
68; SSE-NEXT:    movaps %xmm2, %xmm1
69; SSE-NEXT:    retq
70;
71; AVX1-LABEL: trunc8i64_8i32_ashr:
72; AVX1:       # %bb.0: # %entry
73; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
74; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
75; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
76; AVX1-NEXT:    retq
77;
78; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
79; AVX2-SLOW:       # %bb.0: # %entry
80; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
81; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
82; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
83; AVX2-SLOW-NEXT:    retq
84;
85; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
86; AVX2-FAST-ALL:       # %bb.0: # %entry
87; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
88; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
89; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
90; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
91; AVX2-FAST-ALL-NEXT:    retq
92;
93; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
94; AVX2-FAST-PERLANE:       # %bb.0: # %entry
95; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
96; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
97; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
98; AVX2-FAST-PERLANE-NEXT:    retq
99;
100; AVX512-LABEL: trunc8i64_8i32_ashr:
101; AVX512:       # %bb.0: # %entry
102; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
103; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
104; AVX512-NEXT:    retq
105entry:
106  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
107  %1 = trunc <8 x i64> %0 to <8 x i32>
108  ret <8 x i32> %1
109}
110
111define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
112; SSE-LABEL: trunc8i64_8i32_lshr:
113; SSE:       # %bb.0: # %entry
114; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
115; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
116; SSE-NEXT:    movaps %xmm2, %xmm1
117; SSE-NEXT:    retq
118;
119; AVX1-LABEL: trunc8i64_8i32_lshr:
120; AVX1:       # %bb.0: # %entry
121; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
122; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
123; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
124; AVX1-NEXT:    retq
125;
126; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
127; AVX2-SLOW:       # %bb.0: # %entry
128; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
129; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
130; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
131; AVX2-SLOW-NEXT:    retq
132;
133; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
134; AVX2-FAST-ALL:       # %bb.0: # %entry
135; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
136; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
137; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
138; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
139; AVX2-FAST-ALL-NEXT:    retq
140;
141; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
142; AVX2-FAST-PERLANE:       # %bb.0: # %entry
143; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
144; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
145; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
146; AVX2-FAST-PERLANE-NEXT:    retq
147;
148; AVX512-LABEL: trunc8i64_8i32_lshr:
149; AVX512:       # %bb.0: # %entry
150; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
151; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
152; AVX512-NEXT:    retq
153entry:
154  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
155  %1 = trunc <8 x i64> %0 to <8 x i32>
156  ret <8 x i32> %1
157}
158
159define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
160; SSE2-SSSE3-LABEL: trunc8i64_8i16:
161; SSE2-SSSE3:       # %bb.0: # %entry
162; SSE2-SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
163; SSE2-SSSE3-NEXT:    pslld $16, %xmm2
164; SSE2-SSSE3-NEXT:    psrad $16, %xmm2
165; SSE2-SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
166; SSE2-SSSE3-NEXT:    pslld $16, %xmm0
167; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
168; SSE2-SSSE3-NEXT:    packssdw %xmm2, %xmm0
169; SSE2-SSSE3-NEXT:    retq
170;
171; SSE41-LABEL: trunc8i64_8i16:
172; SSE41:       # %bb.0: # %entry
173; SSE41-NEXT:    pxor %xmm4, %xmm4
174; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
175; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
176; SSE41-NEXT:    packusdw %xmm3, %xmm2
177; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
178; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
179; SSE41-NEXT:    packusdw %xmm1, %xmm0
180; SSE41-NEXT:    packusdw %xmm2, %xmm0
181; SSE41-NEXT:    retq
182;
183; AVX1-LABEL: trunc8i64_8i16:
184; AVX1:       # %bb.0: # %entry
185; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
186; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
187; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
188; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
189; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
190; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
191; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
192; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
193; AVX1-NEXT:    vzeroupper
194; AVX1-NEXT:    retq
195;
196; AVX2-LABEL: trunc8i64_8i16:
197; AVX2:       # %bb.0: # %entry
198; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
199; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
200; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
201; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
202; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
203; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
204; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
205; AVX2-NEXT:    vzeroupper
206; AVX2-NEXT:    retq
207;
208; AVX512-LABEL: trunc8i64_8i16:
209; AVX512:       # %bb.0: # %entry
210; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
211; AVX512-NEXT:    vzeroupper
212; AVX512-NEXT:    retq
213entry:
214  %0 = trunc <8 x i64> %a to <8 x i16>
215  ret <8 x i16> %0
216}
217
218define void @trunc8i64_8i8(<8 x i64> %a) {
219; SSE2-SSSE3-LABEL: trunc8i64_8i8:
220; SSE2-SSSE3:       # %bb.0: # %entry
221; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
222; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm3
223; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
224; SSE2-SSSE3-NEXT:    packuswb %xmm3, %xmm2
225; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm1
226; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm0
227; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
228; SSE2-SSSE3-NEXT:    packuswb %xmm2, %xmm0
229; SSE2-SSSE3-NEXT:    packuswb %xmm0, %xmm0
230; SSE2-SSSE3-NEXT:    movq %xmm0, (%rax)
231; SSE2-SSSE3-NEXT:    retq
232;
233; SSE41-LABEL: trunc8i64_8i8:
234; SSE41:       # %bb.0: # %entry
235; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = [255,255]
236; SSE41-NEXT:    pand %xmm4, %xmm3
237; SSE41-NEXT:    pand %xmm4, %xmm2
238; SSE41-NEXT:    packusdw %xmm3, %xmm2
239; SSE41-NEXT:    pand %xmm4, %xmm1
240; SSE41-NEXT:    pand %xmm4, %xmm0
241; SSE41-NEXT:    packusdw %xmm1, %xmm0
242; SSE41-NEXT:    packusdw %xmm2, %xmm0
243; SSE41-NEXT:    packuswb %xmm0, %xmm0
244; SSE41-NEXT:    movq %xmm0, (%rax)
245; SSE41-NEXT:    retq
246;
247; AVX1-LABEL: trunc8i64_8i8:
248; AVX1:       # %bb.0: # %entry
249; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
250; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
251; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
252; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
253; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
254; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
255; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
256; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
257; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
258; AVX1-NEXT:    vmovq %xmm0, (%rax)
259; AVX1-NEXT:    vzeroupper
260; AVX1-NEXT:    retq
261;
262; AVX2-LABEL: trunc8i64_8i8:
263; AVX2:       # %bb.0: # %entry
264; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
265; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
266; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
267; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
268; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
269; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
270; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
271; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
272; AVX2-NEXT:    vmovq %xmm0, (%rax)
273; AVX2-NEXT:    vzeroupper
274; AVX2-NEXT:    retq
275;
276; AVX512-LABEL: trunc8i64_8i8:
277; AVX512:       # %bb.0: # %entry
278; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
279; AVX512-NEXT:    vzeroupper
280; AVX512-NEXT:    retq
281entry:
282  %0 = trunc <8 x i64> %a to <8 x i8>
283  store <8 x i8> %0, ptr undef, align 4
284  ret void
285}
286
287define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
288; SSE2-LABEL: trunc8i32_8i16:
289; SSE2:       # %bb.0: # %entry
290; SSE2-NEXT:    pslld $16, %xmm1
291; SSE2-NEXT:    psrad $16, %xmm1
292; SSE2-NEXT:    pslld $16, %xmm0
293; SSE2-NEXT:    psrad $16, %xmm0
294; SSE2-NEXT:    packssdw %xmm1, %xmm0
295; SSE2-NEXT:    retq
296;
297; SSSE3-LABEL: trunc8i32_8i16:
298; SSSE3:       # %bb.0: # %entry
299; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
300; SSSE3-NEXT:    pshufb %xmm2, %xmm1
301; SSSE3-NEXT:    pshufb %xmm2, %xmm0
302; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
303; SSSE3-NEXT:    retq
304;
305; SSE41-LABEL: trunc8i32_8i16:
306; SSE41:       # %bb.0: # %entry
307; SSE41-NEXT:    pxor %xmm2, %xmm2
308; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
309; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
310; SSE41-NEXT:    packusdw %xmm1, %xmm0
311; SSE41-NEXT:    retq
312;
313; AVX1-LABEL: trunc8i32_8i16:
314; AVX1:       # %bb.0: # %entry
315; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
316; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
317; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
318; AVX1-NEXT:    vzeroupper
319; AVX1-NEXT:    retq
320;
321; AVX2-LABEL: trunc8i32_8i16:
322; AVX2:       # %bb.0: # %entry
323; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
324; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
325; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
326; AVX2-NEXT:    vzeroupper
327; AVX2-NEXT:    retq
328;
329; AVX512F-LABEL: trunc8i32_8i16:
330; AVX512F:       # %bb.0: # %entry
331; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
332; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
333; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
334; AVX512F-NEXT:    vzeroupper
335; AVX512F-NEXT:    retq
336;
337; AVX512VL-LABEL: trunc8i32_8i16:
338; AVX512VL:       # %bb.0: # %entry
339; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
340; AVX512VL-NEXT:    vzeroupper
341; AVX512VL-NEXT:    retq
342;
343; AVX512BW-LABEL: trunc8i32_8i16:
344; AVX512BW:       # %bb.0: # %entry
345; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
346; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
347; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
348; AVX512BW-NEXT:    vzeroupper
349; AVX512BW-NEXT:    retq
350;
351; AVX512BWVL-LABEL: trunc8i32_8i16:
352; AVX512BWVL:       # %bb.0: # %entry
353; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
354; AVX512BWVL-NEXT:    vzeroupper
355; AVX512BWVL-NEXT:    retq
356entry:
357  %0 = trunc <8 x i32> %a to <8 x i16>
358  ret <8 x i16> %0
359}
360
361define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
362; SSE2-SSSE3-LABEL: trunc8i32_8i16_ashr:
363; SSE2-SSSE3:       # %bb.0: # %entry
364; SSE2-SSSE3-NEXT:    psrad $16, %xmm1
365; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
366; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
367; SSE2-SSSE3-NEXT:    retq
368;
369; SSE41-LABEL: trunc8i32_8i16_ashr:
370; SSE41:       # %bb.0: # %entry
371; SSE41-NEXT:    psrld $16, %xmm1
372; SSE41-NEXT:    psrld $16, %xmm0
373; SSE41-NEXT:    packusdw %xmm1, %xmm0
374; SSE41-NEXT:    retq
375;
376; AVX1-LABEL: trunc8i32_8i16_ashr:
377; AVX1:       # %bb.0: # %entry
378; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
379; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
380; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
381; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
382; AVX1-NEXT:    vzeroupper
383; AVX1-NEXT:    retq
384;
385; AVX2-LABEL: trunc8i32_8i16_ashr:
386; AVX2:       # %bb.0: # %entry
387; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
388; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
389; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
390; AVX2-NEXT:    vzeroupper
391; AVX2-NEXT:    retq
392;
393; AVX512F-LABEL: trunc8i32_8i16_ashr:
394; AVX512F:       # %bb.0: # %entry
395; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
396; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
397; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
398; AVX512F-NEXT:    vzeroupper
399; AVX512F-NEXT:    retq
400;
401; AVX512VL-LABEL: trunc8i32_8i16_ashr:
402; AVX512VL:       # %bb.0: # %entry
403; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
404; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
405; AVX512VL-NEXT:    vzeroupper
406; AVX512VL-NEXT:    retq
407;
408; AVX512BW-LABEL: trunc8i32_8i16_ashr:
409; AVX512BW:       # %bb.0: # %entry
410; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
411; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
412; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
413; AVX512BW-NEXT:    vzeroupper
414; AVX512BW-NEXT:    retq
415;
416; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
417; AVX512BWVL:       # %bb.0: # %entry
418; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
419; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
420; AVX512BWVL-NEXT:    vzeroupper
421; AVX512BWVL-NEXT:    retq
422entry:
423  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
424  %1 = trunc <8 x i32> %0 to <8 x i16>
425  ret <8 x i16> %1
426}
427
428define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
429; SSE2-SSSE3-LABEL: trunc8i32_8i16_lshr:
430; SSE2-SSSE3:       # %bb.0: # %entry
431; SSE2-SSSE3-NEXT:    psrad $16, %xmm1
432; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
433; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
434; SSE2-SSSE3-NEXT:    retq
435;
436; SSE41-LABEL: trunc8i32_8i16_lshr:
437; SSE41:       # %bb.0: # %entry
438; SSE41-NEXT:    psrld $16, %xmm1
439; SSE41-NEXT:    psrld $16, %xmm0
440; SSE41-NEXT:    packusdw %xmm1, %xmm0
441; SSE41-NEXT:    retq
442;
443; AVX1-LABEL: trunc8i32_8i16_lshr:
444; AVX1:       # %bb.0: # %entry
445; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
446; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
447; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
448; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
449; AVX1-NEXT:    vzeroupper
450; AVX1-NEXT:    retq
451;
452; AVX2-LABEL: trunc8i32_8i16_lshr:
453; AVX2:       # %bb.0: # %entry
454; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
455; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
456; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
457; AVX2-NEXT:    vzeroupper
458; AVX2-NEXT:    retq
459;
460; AVX512F-LABEL: trunc8i32_8i16_lshr:
461; AVX512F:       # %bb.0: # %entry
462; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
463; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
464; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
465; AVX512F-NEXT:    vzeroupper
466; AVX512F-NEXT:    retq
467;
468; AVX512VL-LABEL: trunc8i32_8i16_lshr:
469; AVX512VL:       # %bb.0: # %entry
470; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
471; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
472; AVX512VL-NEXT:    vzeroupper
473; AVX512VL-NEXT:    retq
474;
475; AVX512BW-LABEL: trunc8i32_8i16_lshr:
476; AVX512BW:       # %bb.0: # %entry
477; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
478; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
479; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
480; AVX512BW-NEXT:    vzeroupper
481; AVX512BW-NEXT:    retq
482;
483; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
484; AVX512BWVL:       # %bb.0: # %entry
485; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
486; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
487; AVX512BWVL-NEXT:    vzeroupper
488; AVX512BWVL-NEXT:    retq
489entry:
490  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
491  %1 = trunc <8 x i32> %0 to <8 x i16>
492  ret <8 x i16> %1
493}
494
495define void @trunc8i32_8i8(<8 x i32> %a) {
496; SSE2-SSSE3-LABEL: trunc8i32_8i8:
497; SSE2-SSSE3:       # %bb.0: # %entry
498; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
499; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
500; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
501; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
502; SSE2-SSSE3-NEXT:    packuswb %xmm0, %xmm0
503; SSE2-SSSE3-NEXT:    movq %xmm0, (%rax)
504; SSE2-SSSE3-NEXT:    retq
505;
506; SSE41-LABEL: trunc8i32_8i8:
507; SSE41:       # %bb.0: # %entry
508; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
509; SSE41-NEXT:    pand %xmm2, %xmm1
510; SSE41-NEXT:    pand %xmm2, %xmm0
511; SSE41-NEXT:    packusdw %xmm1, %xmm0
512; SSE41-NEXT:    packuswb %xmm0, %xmm0
513; SSE41-NEXT:    movq %xmm0, (%rax)
514; SSE41-NEXT:    retq
515;
516; AVX1-LABEL: trunc8i32_8i8:
517; AVX1:       # %bb.0: # %entry
518; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
519; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
520; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
521; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
522; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
523; AVX1-NEXT:    vmovq %xmm0, (%rax)
524; AVX1-NEXT:    vzeroupper
525; AVX1-NEXT:    retq
526;
527; AVX2-LABEL: trunc8i32_8i8:
528; AVX2:       # %bb.0: # %entry
529; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
530; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
531; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
532; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
533; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
534; AVX2-NEXT:    vmovq %xmm0, (%rax)
535; AVX2-NEXT:    vzeroupper
536; AVX2-NEXT:    retq
537;
538; AVX512F-LABEL: trunc8i32_8i8:
539; AVX512F:       # %bb.0: # %entry
540; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
541; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
542; AVX512F-NEXT:    vmovq %xmm0, (%rax)
543; AVX512F-NEXT:    vzeroupper
544; AVX512F-NEXT:    retq
545;
546; AVX512VL-LABEL: trunc8i32_8i8:
547; AVX512VL:       # %bb.0: # %entry
548; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
549; AVX512VL-NEXT:    vzeroupper
550; AVX512VL-NEXT:    retq
551;
552; AVX512BW-LABEL: trunc8i32_8i8:
553; AVX512BW:       # %bb.0: # %entry
554; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
555; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
556; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
557; AVX512BW-NEXT:    vzeroupper
558; AVX512BW-NEXT:    retq
559;
560; AVX512BWVL-LABEL: trunc8i32_8i8:
561; AVX512BWVL:       # %bb.0: # %entry
562; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
563; AVX512BWVL-NEXT:    vzeroupper
564; AVX512BWVL-NEXT:    retq
565entry:
566  %0 = trunc <8 x i32> %a to <8 x i8>
567  store <8 x i8> %0, ptr undef, align 4
568  ret void
569}
570
571define void @trunc16i32_16i16(<16 x i32> %a) {
572; SSE2-LABEL: trunc16i32_16i16:
573; SSE2:       # %bb.0: # %entry
574; SSE2-NEXT:    pslld $16, %xmm1
575; SSE2-NEXT:    psrad $16, %xmm1
576; SSE2-NEXT:    pslld $16, %xmm0
577; SSE2-NEXT:    psrad $16, %xmm0
578; SSE2-NEXT:    packssdw %xmm1, %xmm0
579; SSE2-NEXT:    pslld $16, %xmm3
580; SSE2-NEXT:    psrad $16, %xmm3
581; SSE2-NEXT:    pslld $16, %xmm2
582; SSE2-NEXT:    psrad $16, %xmm2
583; SSE2-NEXT:    packssdw %xmm3, %xmm2
584; SSE2-NEXT:    movdqu %xmm2, (%rax)
585; SSE2-NEXT:    movdqu %xmm0, (%rax)
586; SSE2-NEXT:    retq
587;
588; SSSE3-LABEL: trunc16i32_16i16:
589; SSSE3:       # %bb.0: # %entry
590; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
591; SSSE3-NEXT:    pshufb %xmm4, %xmm1
592; SSSE3-NEXT:    pshufb %xmm4, %xmm0
593; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
594; SSSE3-NEXT:    pshufb %xmm4, %xmm3
595; SSSE3-NEXT:    pshufb %xmm4, %xmm2
596; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
597; SSSE3-NEXT:    movdqu %xmm2, (%rax)
598; SSSE3-NEXT:    movdqu %xmm0, (%rax)
599; SSSE3-NEXT:    retq
600;
601; SSE41-LABEL: trunc16i32_16i16:
602; SSE41:       # %bb.0: # %entry
603; SSE41-NEXT:    pxor %xmm4, %xmm4
604; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
605; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
606; SSE41-NEXT:    packusdw %xmm1, %xmm0
607; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
608; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
609; SSE41-NEXT:    packusdw %xmm3, %xmm2
610; SSE41-NEXT:    movdqu %xmm2, (%rax)
611; SSE41-NEXT:    movdqu %xmm0, (%rax)
612; SSE41-NEXT:    retq
613;
614; AVX1-LABEL: trunc16i32_16i16:
615; AVX1:       # %bb.0: # %entry
616; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
617; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
618; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
619; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
620; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
621; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
622; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
623; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
624; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
625; AVX1-NEXT:    vzeroupper
626; AVX1-NEXT:    retq
627;
628; AVX2-LABEL: trunc16i32_16i16:
629; AVX2:       # %bb.0: # %entry
630; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
631; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
632; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
633; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
634; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
635; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
636; AVX2-NEXT:    vzeroupper
637; AVX2-NEXT:    retq
638;
639; AVX512-LABEL: trunc16i32_16i16:
640; AVX512:       # %bb.0: # %entry
641; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
642; AVX512-NEXT:    vzeroupper
643; AVX512-NEXT:    retq
644entry:
645  %0 = trunc <16 x i32> %a to <16 x i16>
646  store <16 x i16> %0, ptr undef, align 4
647  ret void
648}
649
650define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
651; SSE2-SSSE3-LABEL: trunc16i32_16i16_ashr:
652; SSE2-SSSE3:       # %bb.0: # %entry
653; SSE2-SSSE3-NEXT:    psrad $16, %xmm1
654; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
655; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
656; SSE2-SSSE3-NEXT:    psrad $16, %xmm3
657; SSE2-SSSE3-NEXT:    psrad $16, %xmm2
658; SSE2-SSSE3-NEXT:    packssdw %xmm3, %xmm2
659; SSE2-SSSE3-NEXT:    movdqu %xmm2, (%rax)
660; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
661; SSE2-SSSE3-NEXT:    retq
662;
663; SSE41-LABEL: trunc16i32_16i16_ashr:
664; SSE41:       # %bb.0: # %entry
665; SSE41-NEXT:    psrld $16, %xmm3
666; SSE41-NEXT:    psrld $16, %xmm2
667; SSE41-NEXT:    packusdw %xmm3, %xmm2
668; SSE41-NEXT:    psrld $16, %xmm1
669; SSE41-NEXT:    psrld $16, %xmm0
670; SSE41-NEXT:    packusdw %xmm1, %xmm0
671; SSE41-NEXT:    movdqu %xmm2, (%rax)
672; SSE41-NEXT:    movdqu %xmm0, (%rax)
673; SSE41-NEXT:    retq
674;
675; AVX1-LABEL: trunc16i32_16i16_ashr:
676; AVX1:       # %bb.0: # %entry
677; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
678; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
679; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
680; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
681; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
682; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
683; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
684; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
685; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
686; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
687; AVX1-NEXT:    vzeroupper
688; AVX1-NEXT:    retq
689;
690; AVX2-LABEL: trunc16i32_16i16_ashr:
691; AVX2:       # %bb.0: # %entry
692; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
693; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
694; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
695; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
696; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
697; AVX2-NEXT:    vzeroupper
698; AVX2-NEXT:    retq
699;
700; AVX512-LABEL: trunc16i32_16i16_ashr:
701; AVX512:       # %bb.0: # %entry
702; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
703; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
704; AVX512-NEXT:    vzeroupper
705; AVX512-NEXT:    retq
706entry:
707  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
708  %1 = trunc <16 x i32> %0 to <16 x i16>
709  store <16 x i16> %1, ptr undef, align 4
710  ret void
711}
712
713define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
714; SSE2-SSSE3-LABEL: trunc16i32_16i16_lshr:
715; SSE2-SSSE3:       # %bb.0: # %entry
716; SSE2-SSSE3-NEXT:    psrad $16, %xmm1
717; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
718; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
719; SSE2-SSSE3-NEXT:    psrad $16, %xmm3
720; SSE2-SSSE3-NEXT:    psrad $16, %xmm2
721; SSE2-SSSE3-NEXT:    packssdw %xmm3, %xmm2
722; SSE2-SSSE3-NEXT:    movdqu %xmm2, (%rax)
723; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
724; SSE2-SSSE3-NEXT:    retq
725;
726; SSE41-LABEL: trunc16i32_16i16_lshr:
727; SSE41:       # %bb.0: # %entry
728; SSE41-NEXT:    psrld $16, %xmm3
729; SSE41-NEXT:    psrld $16, %xmm2
730; SSE41-NEXT:    packusdw %xmm3, %xmm2
731; SSE41-NEXT:    psrld $16, %xmm1
732; SSE41-NEXT:    psrld $16, %xmm0
733; SSE41-NEXT:    packusdw %xmm1, %xmm0
734; SSE41-NEXT:    movdqu %xmm2, (%rax)
735; SSE41-NEXT:    movdqu %xmm0, (%rax)
736; SSE41-NEXT:    retq
737;
738; AVX1-LABEL: trunc16i32_16i16_lshr:
739; AVX1:       # %bb.0: # %entry
740; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
741; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
742; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
743; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
744; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
745; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
746; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
747; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
748; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
749; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
750; AVX1-NEXT:    vzeroupper
751; AVX1-NEXT:    retq
752;
753; AVX2-LABEL: trunc16i32_16i16_lshr:
754; AVX2:       # %bb.0: # %entry
755; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
756; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
757; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
758; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
759; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
760; AVX2-NEXT:    vzeroupper
761; AVX2-NEXT:    retq
762;
763; AVX512-LABEL: trunc16i32_16i16_lshr:
764; AVX512:       # %bb.0: # %entry
765; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
766; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
767; AVX512-NEXT:    vzeroupper
768; AVX512-NEXT:    retq
769entry:
770  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
771  %1 = trunc <16 x i32> %0 to <16 x i16>
772  store <16 x i16> %1, ptr undef, align 4
773  ret void
774}
775
776define void @trunc16i32_16i8(<16 x i32> %a) {
777; SSE2-SSSE3-LABEL: trunc16i32_16i8:
778; SSE2-SSSE3:       # %bb.0: # %entry
779; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
780; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm3
781; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
782; SSE2-SSSE3-NEXT:    packuswb %xmm3, %xmm2
783; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm1
784; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm0
785; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
786; SSE2-SSSE3-NEXT:    packuswb %xmm2, %xmm0
787; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
788; SSE2-SSSE3-NEXT:    retq
789;
790; SSE41-LABEL: trunc16i32_16i8:
791; SSE41:       # %bb.0: # %entry
792; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
793; SSE41-NEXT:    pand %xmm4, %xmm3
794; SSE41-NEXT:    pand %xmm4, %xmm2
795; SSE41-NEXT:    packusdw %xmm3, %xmm2
796; SSE41-NEXT:    pand %xmm4, %xmm1
797; SSE41-NEXT:    pand %xmm4, %xmm0
798; SSE41-NEXT:    packusdw %xmm1, %xmm0
799; SSE41-NEXT:    packuswb %xmm2, %xmm0
800; SSE41-NEXT:    movdqu %xmm0, (%rax)
801; SSE41-NEXT:    retq
802;
803; AVX1-LABEL: trunc16i32_16i8:
804; AVX1:       # %bb.0: # %entry
805; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
806; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
807; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
808; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
809; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
810; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
811; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
812; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
813; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
814; AVX1-NEXT:    vzeroupper
815; AVX1-NEXT:    retq
816;
817; AVX2-LABEL: trunc16i32_16i8:
818; AVX2:       # %bb.0: # %entry
819; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
820; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
821; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
822; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
823; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
824; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
825; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
826; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
827; AVX2-NEXT:    vzeroupper
828; AVX2-NEXT:    retq
829;
830; AVX512-LABEL: trunc16i32_16i8:
831; AVX512:       # %bb.0: # %entry
832; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
833; AVX512-NEXT:    vzeroupper
834; AVX512-NEXT:    retq
835entry:
836  %0 = trunc <16 x i32> %a to <16 x i8>
837  store <16 x i8> %0, ptr undef, align 4
838  ret void
839}
840
841define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
842; SSE2-SSSE3-LABEL: trunc16i32_16i8_ashr:
843; SSE2-SSSE3:       # %bb.0: # %entry
844; SSE2-SSSE3-NEXT:    psrld $24, %xmm1
845; SSE2-SSSE3-NEXT:    psrld $24, %xmm0
846; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
847; SSE2-SSSE3-NEXT:    psrld $24, %xmm3
848; SSE2-SSSE3-NEXT:    psrld $24, %xmm2
849; SSE2-SSSE3-NEXT:    packuswb %xmm3, %xmm2
850; SSE2-SSSE3-NEXT:    packuswb %xmm2, %xmm0
851; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
852; SSE2-SSSE3-NEXT:    retq
853;
854; SSE41-LABEL: trunc16i32_16i8_ashr:
855; SSE41:       # %bb.0: # %entry
856; SSE41-NEXT:    psrld $24, %xmm1
857; SSE41-NEXT:    psrld $24, %xmm0
858; SSE41-NEXT:    packusdw %xmm1, %xmm0
859; SSE41-NEXT:    psrld $24, %xmm3
860; SSE41-NEXT:    psrld $24, %xmm2
861; SSE41-NEXT:    packusdw %xmm3, %xmm2
862; SSE41-NEXT:    packuswb %xmm2, %xmm0
863; SSE41-NEXT:    movdqu %xmm0, (%rax)
864; SSE41-NEXT:    retq
865;
866; AVX1-LABEL: trunc16i32_16i8_ashr:
867; AVX1:       # %bb.0: # %entry
868; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
869; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
870; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
871; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
872; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
873; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
874; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
875; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
876; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
877; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
878; AVX1-NEXT:    vzeroupper
879; AVX1-NEXT:    retq
880;
881; AVX2-LABEL: trunc16i32_16i8_ashr:
882; AVX2:       # %bb.0: # %entry
883; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
884; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
885; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
886; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
887; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
888; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
889; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
890; AVX2-NEXT:    vzeroupper
891; AVX2-NEXT:    retq
892;
893; AVX512-LABEL: trunc16i32_16i8_ashr:
894; AVX512:       # %bb.0: # %entry
895; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
896; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
897; AVX512-NEXT:    vzeroupper
898; AVX512-NEXT:    retq
899entry:
900  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
901  %1 = trunc <16 x i32> %0 to <16 x i8>
902  store <16 x i8> %1, ptr undef, align 4
903  ret void
904}
905
906define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
907; SSE2-SSSE3-LABEL: trunc16i32_16i8_lshr:
908; SSE2-SSSE3:       # %bb.0: # %entry
909; SSE2-SSSE3-NEXT:    psrld $24, %xmm1
910; SSE2-SSSE3-NEXT:    psrld $24, %xmm0
911; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
912; SSE2-SSSE3-NEXT:    psrld $24, %xmm3
913; SSE2-SSSE3-NEXT:    psrld $24, %xmm2
914; SSE2-SSSE3-NEXT:    packuswb %xmm3, %xmm2
915; SSE2-SSSE3-NEXT:    packuswb %xmm2, %xmm0
916; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
917; SSE2-SSSE3-NEXT:    retq
918;
919; SSE41-LABEL: trunc16i32_16i8_lshr:
920; SSE41:       # %bb.0: # %entry
921; SSE41-NEXT:    psrld $24, %xmm1
922; SSE41-NEXT:    psrld $24, %xmm0
923; SSE41-NEXT:    packusdw %xmm1, %xmm0
924; SSE41-NEXT:    psrld $24, %xmm3
925; SSE41-NEXT:    psrld $24, %xmm2
926; SSE41-NEXT:    packusdw %xmm3, %xmm2
927; SSE41-NEXT:    packuswb %xmm2, %xmm0
928; SSE41-NEXT:    movdqu %xmm0, (%rax)
929; SSE41-NEXT:    retq
930;
931; AVX1-LABEL: trunc16i32_16i8_lshr:
932; AVX1:       # %bb.0: # %entry
933; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
934; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
935; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
936; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
937; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
938; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
939; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
940; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
941; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
942; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
943; AVX1-NEXT:    vzeroupper
944; AVX1-NEXT:    retq
945;
946; AVX2-LABEL: trunc16i32_16i8_lshr:
947; AVX2:       # %bb.0: # %entry
948; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
949; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
950; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
951; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
952; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
953; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
954; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
955; AVX2-NEXT:    vzeroupper
956; AVX2-NEXT:    retq
957;
958; AVX512-LABEL: trunc16i32_16i8_lshr:
959; AVX512:       # %bb.0: # %entry
960; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
961; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
962; AVX512-NEXT:    vzeroupper
963; AVX512-NEXT:    retq
964entry:
965  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
966  %1 = trunc <16 x i32> %0 to <16 x i8>
967  store <16 x i8> %1, ptr undef, align 4
968  ret void
969}
970
971;PR25684
972define void @trunc16i16_16i8(<16 x i16> %a) {
973; SSE2-SSSE3-LABEL: trunc16i16_16i8:
974; SSE2-SSSE3:       # %bb.0: # %entry
975; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
976; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
977; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
978; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
979; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
980; SSE2-SSSE3-NEXT:    retq
981;
982; SSE41-LABEL: trunc16i16_16i8:
983; SSE41:       # %bb.0: # %entry
984; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
985; SSE41-NEXT:    pand %xmm2, %xmm1
986; SSE41-NEXT:    pand %xmm2, %xmm0
987; SSE41-NEXT:    packuswb %xmm1, %xmm0
988; SSE41-NEXT:    movdqu %xmm0, (%rax)
989; SSE41-NEXT:    retq
990;
991; AVX1-LABEL: trunc16i16_16i8:
992; AVX1:       # %bb.0: # %entry
993; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
994; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
995; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
996; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
997; AVX1-NEXT:    vzeroupper
998; AVX1-NEXT:    retq
999;
1000; AVX2-LABEL: trunc16i16_16i8:
1001; AVX2:       # %bb.0: # %entry
1002; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1003; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1004; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1005; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
1006; AVX2-NEXT:    vzeroupper
1007; AVX2-NEXT:    retq
1008;
1009; AVX512F-LABEL: trunc16i16_16i8:
1010; AVX512F:       # %bb.0: # %entry
1011; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1012; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
1013; AVX512F-NEXT:    vzeroupper
1014; AVX512F-NEXT:    retq
1015;
1016; AVX512VL-LABEL: trunc16i16_16i8:
1017; AVX512VL:       # %bb.0: # %entry
1018; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1019; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
1020; AVX512VL-NEXT:    vzeroupper
1021; AVX512VL-NEXT:    retq
1022;
1023; AVX512BW-LABEL: trunc16i16_16i8:
1024; AVX512BW:       # %bb.0: # %entry
1025; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1026; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1027; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
1028; AVX512BW-NEXT:    vzeroupper
1029; AVX512BW-NEXT:    retq
1030;
1031; AVX512BWVL-LABEL: trunc16i16_16i8:
1032; AVX512BWVL:       # %bb.0: # %entry
1033; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
1034; AVX512BWVL-NEXT:    vzeroupper
1035; AVX512BWVL-NEXT:    retq
1036entry:
1037  %0 = trunc <16 x i16> %a to <16 x i8>
1038  store <16 x i8> %0, ptr undef, align 4
1039  ret void
1040}
1041
1042define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
1043; SSE-LABEL: trunc16i16_16i8_ashr:
1044; SSE:       # %bb.0: # %entry
1045; SSE-NEXT:    psrlw $8, %xmm1
1046; SSE-NEXT:    psrlw $8, %xmm0
1047; SSE-NEXT:    packuswb %xmm1, %xmm0
1048; SSE-NEXT:    movdqu %xmm0, (%rax)
1049; SSE-NEXT:    retq
1050;
1051; AVX1-LABEL: trunc16i16_16i8_ashr:
1052; AVX1:       # %bb.0: # %entry
1053; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1054; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1055; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1056; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1057; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1058; AVX1-NEXT:    vzeroupper
1059; AVX1-NEXT:    retq
1060;
1061; AVX2-LABEL: trunc16i16_16i8_ashr:
1062; AVX2:       # %bb.0: # %entry
1063; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1064; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1065; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1066; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
1067; AVX2-NEXT:    vzeroupper
1068; AVX2-NEXT:    retq
1069;
1070; AVX512F-LABEL: trunc16i16_16i8_ashr:
1071; AVX512F:       # %bb.0: # %entry
1072; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
1073; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1074; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
1075; AVX512F-NEXT:    vzeroupper
1076; AVX512F-NEXT:    retq
1077;
1078; AVX512VL-LABEL: trunc16i16_16i8_ashr:
1079; AVX512VL:       # %bb.0: # %entry
1080; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1081; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1082; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
1083; AVX512VL-NEXT:    vzeroupper
1084; AVX512VL-NEXT:    retq
1085;
1086; AVX512BW-LABEL: trunc16i16_16i8_ashr:
1087; AVX512BW:       # %bb.0: # %entry
1088; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
1089; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1090; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
1091; AVX512BW-NEXT:    vzeroupper
1092; AVX512BW-NEXT:    retq
1093;
1094; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
1095; AVX512BWVL:       # %bb.0: # %entry
1096; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1097; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
1098; AVX512BWVL-NEXT:    vzeroupper
1099; AVX512BWVL-NEXT:    retq
1100entry:
1101  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1102  %1 = trunc <16 x i16> %0 to <16 x i8>
1103  store <16 x i8> %1, ptr undef, align 4
1104  ret void
1105}
1106
1107define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
1108; SSE-LABEL: trunc16i16_16i8_lshr:
1109; SSE:       # %bb.0: # %entry
1110; SSE-NEXT:    psrlw $8, %xmm1
1111; SSE-NEXT:    psrlw $8, %xmm0
1112; SSE-NEXT:    packuswb %xmm1, %xmm0
1113; SSE-NEXT:    movdqu %xmm0, (%rax)
1114; SSE-NEXT:    retq
1115;
1116; AVX1-LABEL: trunc16i16_16i8_lshr:
1117; AVX1:       # %bb.0: # %entry
1118; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1119; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1120; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1121; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1122; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1123; AVX1-NEXT:    vzeroupper
1124; AVX1-NEXT:    retq
1125;
1126; AVX2-LABEL: trunc16i16_16i8_lshr:
1127; AVX2:       # %bb.0: # %entry
1128; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1129; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1130; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1131; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
1132; AVX2-NEXT:    vzeroupper
1133; AVX2-NEXT:    retq
1134;
1135; AVX512F-LABEL: trunc16i16_16i8_lshr:
1136; AVX512F:       # %bb.0: # %entry
1137; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
1138; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1139; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
1140; AVX512F-NEXT:    vzeroupper
1141; AVX512F-NEXT:    retq
1142;
1143; AVX512VL-LABEL: trunc16i16_16i8_lshr:
1144; AVX512VL:       # %bb.0: # %entry
1145; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1146; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1147; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
1148; AVX512VL-NEXT:    vzeroupper
1149; AVX512VL-NEXT:    retq
1150;
1151; AVX512BW-LABEL: trunc16i16_16i8_lshr:
1152; AVX512BW:       # %bb.0: # %entry
1153; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
1154; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1155; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
1156; AVX512BW-NEXT:    vzeroupper
1157; AVX512BW-NEXT:    retq
1158;
1159; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
1160; AVX512BWVL:       # %bb.0: # %entry
1161; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1162; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
1163; AVX512BWVL-NEXT:    vzeroupper
1164; AVX512BWVL-NEXT:    retq
1165entry:
1166  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1167  %1 = trunc <16 x i16> %0 to <16 x i8>
1168  store <16 x i8> %1, ptr undef, align 4
1169  ret void
1170}
1171
1172define void @trunc32i16_32i8(<32 x i16> %a) {
1173; SSE2-SSSE3-LABEL: trunc32i16_32i8:
1174; SSE2-SSSE3:       # %bb.0: # %entry
1175; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1176; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm1
1177; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm0
1178; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
1179; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm3
1180; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
1181; SSE2-SSSE3-NEXT:    packuswb %xmm3, %xmm2
1182; SSE2-SSSE3-NEXT:    movdqu %xmm2, (%rax)
1183; SSE2-SSSE3-NEXT:    movdqu %xmm0, (%rax)
1184; SSE2-SSSE3-NEXT:    retq
1185;
1186; SSE41-LABEL: trunc32i16_32i8:
1187; SSE41:       # %bb.0: # %entry
1188; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1189; SSE41-NEXT:    pand %xmm4, %xmm1
1190; SSE41-NEXT:    pand %xmm4, %xmm0
1191; SSE41-NEXT:    packuswb %xmm1, %xmm0
1192; SSE41-NEXT:    pand %xmm4, %xmm3
1193; SSE41-NEXT:    pand %xmm4, %xmm2
1194; SSE41-NEXT:    packuswb %xmm3, %xmm2
1195; SSE41-NEXT:    movdqu %xmm2, (%rax)
1196; SSE41-NEXT:    movdqu %xmm0, (%rax)
1197; SSE41-NEXT:    retq
1198;
1199; AVX1-LABEL: trunc32i16_32i8:
1200; AVX1:       # %bb.0: # %entry
1201; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1202; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1203; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1204; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1205; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1206; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1207; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
1208; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
1209; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
1210; AVX1-NEXT:    vzeroupper
1211; AVX1-NEXT:    retq
1212;
1213; AVX2-LABEL: trunc32i16_32i8:
1214; AVX2:       # %bb.0: # %entry
1215; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1216; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1217; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1218; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1219; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1220; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
1221; AVX2-NEXT:    vzeroupper
1222; AVX2-NEXT:    retq
1223;
1224; AVX512F-LABEL: trunc32i16_32i8:
1225; AVX512F:       # %bb.0: # %entry
1226; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1227; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1228; AVX512F-NEXT:    vpmovdb %zmm1, (%rax)
1229; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1230; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
1231; AVX512F-NEXT:    vzeroupper
1232; AVX512F-NEXT:    retq
1233;
1234; AVX512VL-LABEL: trunc32i16_32i8:
1235; AVX512VL:       # %bb.0: # %entry
1236; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1237; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1238; AVX512VL-NEXT:    vpmovdb %zmm1, (%rax)
1239; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1240; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
1241; AVX512VL-NEXT:    vzeroupper
1242; AVX512VL-NEXT:    retq
1243;
1244; AVX512BW-LABEL: trunc32i16_32i8:
1245; AVX512BW:       # %bb.0: # %entry
1246; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
1247; AVX512BW-NEXT:    vzeroupper
1248; AVX512BW-NEXT:    retq
1249;
1250; AVX512BWVL-LABEL: trunc32i16_32i8:
1251; AVX512BWVL:       # %bb.0: # %entry
1252; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
1253; AVX512BWVL-NEXT:    vzeroupper
1254; AVX512BWVL-NEXT:    retq
1255entry:
1256  %0 = trunc <32 x i16> %a to <32 x i8>
1257  store <32 x i8> %0, ptr undef, align 4
1258  ret void
1259}
1260
1261define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
1262; SSE-LABEL: trunc2x4i64_8i32:
1263; SSE:       # %bb.0: # %entry
1264; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1265; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1266; SSE-NEXT:    movaps %xmm2, %xmm1
1267; SSE-NEXT:    retq
1268;
1269; AVX1-LABEL: trunc2x4i64_8i32:
1270; AVX1:       # %bb.0: # %entry
1271; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
1272; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1273; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
1274; AVX1-NEXT:    retq
1275;
1276; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
1277; AVX2-SLOW:       # %bb.0: # %entry
1278; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
1279; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1280; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
1281; AVX2-SLOW-NEXT:    retq
1282;
1283; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32:
1284; AVX2-FAST-ALL:       # %bb.0: # %entry
1285; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1286; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
1287; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
1288; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1289; AVX2-FAST-ALL-NEXT:    retq
1290;
1291; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32:
1292; AVX2-FAST-PERLANE:       # %bb.0: # %entry
1293; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
1294; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1295; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
1296; AVX2-FAST-PERLANE-NEXT:    retq
1297;
1298; AVX512-LABEL: trunc2x4i64_8i32:
1299; AVX512:       # %bb.0: # %entry
1300; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1301; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1302; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1303; AVX512-NEXT:    retq
1304entry:
1305  %0 = trunc <4 x i64> %a to <4 x i32>
1306  %1 = trunc <4 x i64> %b to <4 x i32>
1307  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1308  ret <8 x i32> %2
1309}
1310
define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-SSSE3-LABEL: trunc2x4i64_8i16:
; SSE2-SSSE3:       # %bb.0: # %entry
; SSE2-SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-SSSE3-NEXT:    pslld $16, %xmm0
; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
; SSE2-SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE2-SSSE3-NEXT:    pslld $16, %xmm2
; SSE2-SSSE3-NEXT:    psrad $16, %xmm2
; SSE2-SSSE3-NEXT:    packssdw %xmm2, %xmm0
; SSE2-SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqw %ymm1, %xmm1
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
  %1 = trunc <4 x i64> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

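; Truncate two <2 x i64> args to <2 x i32> and concatenate; a single shufps
; covers this everywhere except the VL subtargets, which use vpmovqd instead.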
define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2x2i64_4i32:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2x2i64_4i32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x2i64_4i32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x2i64_4i32:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x2i64_4i32:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %a to <2 x i32>
  %1 = trunc <2 x i64> %b to <2 x i32>
  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

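; Truncate <2 x i64> to <2 x i32>, then bitcast the result to a scalar i64.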
define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2i64_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2i64_i64:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %inval to <2 x i32>
  %1 = bitcast <2 x i32> %0 to i64
  ret i64 %1
}

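; Truncate two <4 x i32> args to <4 x i16> and concatenate into <8 x i16>.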
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x4i32_8i16:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i32_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %a to <4 x i16>
  %1 = trunc <4 x i32> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
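; Truncate <4 x i32> to <4 x i16> and bitcast to i64; this should stay as a
; single in-register shuffle (or vpmovdw) feeding a movq, with no scalarization.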
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc4i32_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc4i32_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc4i32_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %xmm0, %xmm0
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %inval to <4 x i16>
  %1 = bitcast <4 x i16> %0 to i64
  ret i64 %1
}

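; Truncate two <16 x i16> args to <16 x i8> and concatenate into <32 x i8>.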
define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) {
; SSE2-SSSE3-LABEL: trunc2x16i16_32i8:
; SSE2-SSSE3:       # %bb.0: # %entry
; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm1
; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm0
; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm3
; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm4
; SSE2-SSSE3-NEXT:    packuswb %xmm3, %xmm4
; SSE2-SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSE2-SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x16i16_32i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm4
; SSE41-NEXT:    packuswb %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc2x16i16_32i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc2x16i16_32i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc2x16i16_32i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x16i16_32i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x16i16_32i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x16i16_32i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  %1 = trunc <16 x i16> %b to <16 x i8>
  %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

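; Truncate two <8 x i16> args to <8 x i8> and concatenate; mask-and-pack
; (pand + packuswb) is enough on pre-AVX512BW subtargets.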
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-SSSE3-LABEL: trunc2x8i16_16i8:
; SSE2-SSSE3:       # %bb.0: # %entry
; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSE2-SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc2x8i16_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc2x8i16_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc2x8i16_16i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x8i16_16i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x8i16_16i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x8i16_16i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %a to <8 x i8>
  %1 = trunc <8 x i16> %b to <8 x i8>
  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
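; Truncate <8 x i16> to <8 x i8> and bitcast to i64; a single byte shuffle
; (or vpmovwb on BWVL) feeds the scalar move.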
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc8i16_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc8i16_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i16_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %inval to <8 x i8>
  %1 = bitcast <8 x i8> %0 to i64
  ret i64 %1
}

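; Truncating a zero constant should fold away entirely, leaving just an xor.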
define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc16i64_16i8_const:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc16i64_16i8_const:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq

entry:
  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
  ret <16 x i8> %1
}

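; PR32160: splat of a single truncated element; on most subtargets the trunc
; folds into the splat shuffle itself.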
define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR32160:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512F-LABEL: PR32160:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR32160:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR32160:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}

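; PR34773: trunc(lshr x, 8) of loaded <16 x i16> values, i.e. extracting the
; high byte of each word; lowers to shift+pack, or shift+vpmovwb with AVX512BW.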
define void @PR34773(ptr %a0, ptr %a1) {
; SSE-LABEL: PR34773:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqu 32(%rdi), %xmm2
; SSE-NEXT:    movdqu 48(%rdi), %xmm3
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    movdqu %xmm0, (%rsi)
; SSE-NEXT:    movdqu %xmm2, 16(%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR34773:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR34773:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: PR34773:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, 16(%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34773:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, 16(%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34773:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34773:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, 32(%rdi), %ymm1
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vpmovwb %ymm1, 16(%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = getelementptr i16, ptr %a0, i64 16
  %2 = getelementptr i8, ptr %a1, i64 16
  %3 = load <16 x i16>, ptr %a0, align 2
  %4 = load <16 x i16>, ptr %1, align 2
  %5 = lshr <16 x i16> %3, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %6 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %7 = trunc <16 x i16> %5 to <16 x i8>
  %8 = trunc <16 x i16> %6 to <16 x i8>
  store <16 x i8> %7, ptr %a1, align 1
  store <16 x i8> %8, ptr %2, align 1
  ret void
}

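; PR66194: add reduction of a <16 x i16> vector of ones with scalar setcc
; results inserted into the upper half.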
define i16 @PR66194(i8 %q) {
; SSE2-SSSE3-LABEL: PR66194:
; SSE2-SSSE3:       # %bb.0: # %entry
; SSE2-SSSE3-NEXT:    xorl %eax, %eax
; SSE2-SSSE3-NEXT:    xorl %ecx, %ecx
; SSE2-SSSE3-NEXT:    testb %dil, %dil
; SSE2-SSSE3-NEXT:    setne %al
; SSE2-SSSE3-NEXT:    sete %cl
; SSE2-SSSE3-NEXT:    movl %ecx, %edx
; SSE2-SSSE3-NEXT:    shll $16, %edx
; SSE2-SSSE3-NEXT:    orl %eax, %edx
; SSE2-SSSE3-NEXT:    movd %edx, %xmm0
; SSE2-SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
; SSE2-SSSE3-NEXT:    pinsrw $3, %eax, %xmm0
; SSE2-SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE2-SSSE3-NEXT:    pinsrw $5, %eax, %xmm0
; SSE2-SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
; SSE2-SSSE3-NEXT:    pinsrw $7, %ecx, %xmm0
; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-SSSE3-NEXT:    psubw %xmm1, %xmm0
; SSE2-SSSE3-NEXT:    packuswb %xmm0, %xmm0
; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSE2-SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE2-SSSE3-NEXT:    movd %xmm1, %eax
; SSE2-SSSE3-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR66194:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    xorl %ecx, %ecx
; SSE41-NEXT:    testb %dil, %dil
; SSE41-NEXT:    setne %al
; SSE41-NEXT:    sete %cl
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
; SSE41-NEXT:    pinsrb $4, %eax, %xmm0
; SSE41-NEXT:    pinsrb $6, %eax, %xmm0
; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
; SSE41-NEXT:    pinsrb $10, %eax, %xmm0
; SSE41-NEXT:    pinsrb $12, %eax, %xmm0
; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psadbw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR66194:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    xorl %eax, %eax
; AVX1-NEXT:    testb %dil, %dil
; AVX1-NEXT:    setne %al
; AVX1-NEXT:    sete %cl
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR66194:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    xorl %eax, %eax
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    testb %dil, %dil
; AVX2-NEXT:    setne %al
; AVX2-NEXT:    sete %cl
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR66194:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    xorl %ecx, %ecx
; AVX512-NEXT:    testb %dil, %dil
; AVX512-NEXT:    setne %al
; AVX512-NEXT:    sete %cl
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX512-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX512-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; AVX512-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX512-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX512-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
entry:
  %cmp12.i.13 = icmp ne i8 %q, 0
  %cond.i15.13 = zext i1 %cmp12.i.13 to i16
  %tobool.not.i.13 = icmp eq i8 %q, 0
  %cond18.i.13 = zext i1 %tobool.not.i.13 to i16
  %0 = insertelement <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i16 %cond.i15.13, i64 8
  %1 = insertelement <16 x i16> %0, i16 %cond18.i.13, i64 9
  %2 = insertelement <16 x i16> %1, i16 %cond.i15.13, i64 10
  %3 = insertelement <16 x i16> %2, i16 %cond.i15.13, i64 11
  %4 = insertelement <16 x i16> %3, i16 %cond18.i.13, i64 12
  %5 = insertelement <16 x i16> %4, i16 %cond.i15.13, i64 13
  %6 = insertelement <16 x i16> %5, i16 %cond.i15.13, i64 14
  %7 = insertelement <16 x i16> %6, i16 %cond18.i.13, i64 15
  %8 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
  ret i16 %8
}
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)

; Store merging must not infinitely fight store splitting.
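; The two adjacent under-aligned <8 x i16> stores below are candidates for
; merging, and the merged store is a candidate for splitting; the combiner has
; to reach a fixed point rather than cycling between the two transforms.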

define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) align 2 {
; SSE2-LABEL: store_merge_split:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    shlq $4, %rdi
; SSE2-NEXT:    movdqu %xmm0, (%rsi,%rdi)
; SSE2-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: store_merge_split:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb %xmm4, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT:    shlq $4, %rdi
; SSSE3-NEXT:    movdqu %xmm0, (%rsi,%rdi)
; SSSE3-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: store_merge_split:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    shlq $4, %rdi
; SSE41-NEXT:    movdqu %xmm0, (%rsi,%rdi)
; SSE41-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: store_merge_split:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    shlq $4, %rdi
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_merge_split:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    shlq $4, %rdi
; AVX2-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: store_merge_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    shlq $4, %rdi
; AVX512F-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX512F-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: store_merge_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    shlq $4, %rdi
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi,%rdi)
; AVX512VL-NEXT:    vpmovdw %ymm1, 16(%rsi,%rdi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: store_merge_split:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT:    shlq $4, %rdi
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: store_merge_split:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    shlq $4, %rdi
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi,%rdi)
; AVX512BWVL-NEXT:    vpmovdw %ymm1, 16(%rsi,%rdi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %t1 = trunc <8 x i32> %w1 to <8 x i16>
  %t2 = trunc <8 x i32> %w2 to <8 x i16>
  %g1 = getelementptr inbounds <8 x i16>, ptr %p, i64 %idx
  %g2 = getelementptr inbounds <8 x i16>, ptr %g1, i64 1
  store <8 x i16> %t1, ptr %g1, align 2
  store <8 x i16> %t2, ptr %g2, align 2
  ret void
}
