; xref: llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll (revision f0dd12ec5c0169ba5b4363b62d59511181cf954a)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
; kunpackd pattern: the low 32 bits of two <64 x i1> compare masks are
; concatenated into a 64-bit mask, which is then ANDed with a third compare.
define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
; X86-NEXT:    kandd %k0, %k2, %k0
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kshiftrq $32, %k2, %k0
; X86-NEXT:    kandd %k1, %k0, %k0
; X86-NEXT:    kmovd %k0, %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqb %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckdq %k0, %k1, %k1
; X64-NEXT:    vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <64 x i8>
  %1 = bitcast <8 x i64> %__F to <64 x i8>
  %2 = bitcast <8 x i64> %__B to <64 x i8>
  %3 = bitcast <8 x i64> %__A to <64 x i8>
  %4 = icmp ne <64 x i8> %2, %3
  %5 = bitcast <8 x i64> %__C to <64 x i8>
  %6 = bitcast <8 x i64> %__D to <64 x i8>
  %7 = icmp ne <64 x i8> %5, %6
  %8 = shufflevector <64 x i1> %4, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %9 = shufflevector <64 x i1> %7, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %10 = shufflevector <32 x i1> %8, <32 x i1> %9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %11 = icmp ne <64 x i8> %0, %1
  %12 = and <64 x i1> %11, %10
  %13 = bitcast <64 x i1> %12 to i64
  ret i64 %13
}

; kunpackw pattern: the low 16 bits of two <32 x i1> compare masks are
; concatenated into a 32-bit mask, which is then ANDed with a third compare.
define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackw:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckwd %k0, %k1, %k1
; X86-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackw:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqw %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckwd %k0, %k1, %k1
; X64-NEXT:    vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <32 x i16>
  %1 = bitcast <8 x i64> %__F to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = bitcast <8 x i64> %__A to <32 x i16>
  %4 = icmp ne <32 x i16> %2, %3
  %5 = bitcast <8 x i64> %__C to <32 x i16>
  %6 = bitcast <8 x i64> %__D to <32 x i16>
  %7 = icmp ne <32 x i16> %5, %6
  %8 = shufflevector <32 x i1> %4, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = shufflevector <32 x i1> %7, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %10 = shufflevector <16 x i1> %8, <16 x i1> %9, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %11 = icmp ne <32 x i16> %0, %1
  %12 = and <32 x i1> %11, %10
  %13 = bitcast <32 x i1> %12 to i32
  ret i32 %13
}


; Merge-masked broadcast of an i8 scalar; the i64 mask is bitcast to <64 x i1>.
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A)  {
; X86-LABEL: test_mm512_mask_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1}
; X64-NEXT:    retq
  entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <64 x i8>
  %1 = bitcast i64 %__M to <64 x i1>
  %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
  %3 = bitcast <64 x i8> %2 to <8 x i64>
  ret <8 x i64> %3
}

; Zero-masked broadcast of an i8 scalar; non-selected lanes become zero.
define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A)  {
; X86-LABEL: test_mm512_maskz_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast i64 %__M to <64 x i1>
  %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
  %2 = bitcast <64 x i8> %1 to <8 x i64>
  ret <8 x i64> %2
}

; Merge-masked broadcast of an i16 scalar; the i32 mask is bitcast to <32 x i1>.
define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A)  {
; X86-LABEL: test_mm512_mask_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1}
; X64-NEXT:    retq
  entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <32 x i16>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
  %3 = bitcast <32 x i16> %2 to <8 x i64>
  ret <8 x i64> %3
}

; Zero-masked broadcast of an i16 scalar; non-selected lanes become zero.
define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A)  {
; X86-LABEL: test_mm512_maskz_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast i32 %__M to <32 x i1>
  %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
  %2 = bitcast <32 x i16> %1 to <8 x i64>
  ret <8 x i64> %2
}

; Unmasked broadcast of xmm byte 0 to all 64 lanes of a zmm.
define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; Merge-masked byte broadcast; the <64 x i1> mask is loaded from memory.
define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, ptr %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = load <64 x i1>, ptr %a1
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Zero-masked byte broadcast; the <64 x i1> mask is loaded from memory.
define <8 x i64> @test_mm512_maskz_broadcastb_epi8(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = load <64 x i1>, ptr %a0
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Unmasked broadcast of xmm word 0 to all 32 lanes of a zmm.
define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; Merge-masked word broadcast; the i32 mask is bitcast to <32 x i1>.
define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Zero-masked word broadcast; the i32 mask is bitcast to <32 x i1>.
define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Per-128-bit-lane byte shuffle with zero fill (autogenerated checks select vpsrldq).
define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; Per-128-bit-lane byte shift right by 5 with zero fill (vpsrldq).
define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; Unmasked byte unpack-high within each 128-bit lane (vpunpckhbw).
define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; TODO - improve support for i64 -> mmask64 on 32-bit targets
; Merge-masked byte unpack-high; the <64 x i1> mask is loaded from memory.
define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, ptr %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %sel1 = load <64 x i1>, ptr %a1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Zero-masked byte unpack-high; the <64 x i1> mask is loaded from memory.
define <8 x i64> @test_mm512_maskz_unpackhi_epi8(ptr %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X64-NEXT:    retq
  %sel0 = load <64 x i1>, ptr %a0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Unmasked word unpack-high within each 128-bit lane (vpunpckhwd).
define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; Merge-masked word unpack-high; the i32 mask is bitcast to <32 x i1>.
define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Zero-masked word unpack-high; the i32 mask is bitcast to <32 x i1>.
define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; Unmasked byte unpack-low within each 128-bit lane (vpunpcklbw).
define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; Merge-masked byte unpack-low; the <64 x i1> mask is loaded from memory.
define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, ptr %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %sel1 = load <64 x i1>, ptr %a1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

; _mm512_maskz_unpacklo_epi8: interleave the low 8 bytes of each 128-bit lane of
; %a1/%a2 (vpunpcklbw), then zero every byte whose bit in the 64-bit mask loaded
; from %a0 is clear ({z} zero-masking via the select against zeroinitializer).
define <8 x i64> @test_mm512_maskz_unpacklo_epi8(ptr %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT:    retq
  %sel0 = load <64 x i1>, ptr %a0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
500
; _mm512_unpacklo_epi16 (unmasked): interleave the low 4 words of each 128-bit
; lane of %a0/%a1; should select a single vpunpcklwd.
define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}
512
; _mm512_mask_unpacklo_epi16: word unpack-low of %a2/%a3 merged into %a0 under
; the 32-bit mask %a1 (merge-masking via the select with %arg0 as fallback).
define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
534
; _mm512_maskz_unpacklo_epi16: word unpack-low of %a1/%a2 with zero-masking
; under the 32-bit mask %a0 (select against zeroinitializer -> {z} form).
define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
555
; _mm512_test_epi8_mask: and + icmp-ne-zero over 64 bytes should fold to a
; single vptestmb; on X86 the i64 result is split across eax/edx via kshiftrq.
define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}
579
; _mm512_mask_test_epi8_mask: as above but the result mask is and-ed with the
; caller-supplied i64 mask %__U (masked vptestmb on X64; andl pair on X86).
define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}
608
; _mm512_test_epi16_mask: and + icmp-ne-zero over 32 words should fold to a
; single vptestmw; the 32-bit mask fits one GPR on both targets.
define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_test_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}
623
; _mm512_mask_test_epi16_mask: vptestmw under the i32 write mask %__U, applied
; as the {%k1} predicate on both targets.
define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}
649
; _mm512_testn_epi8_mask: same as test_epi8_mask but icmp eq, selecting the
; negated-test instruction vptestnmb.
define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}
673
; _mm512_mask_testn_epi8_mask: vptestnmb result and-ed with the i64 mask %__U
; (masked vptestnmb on X64; split kshiftrq/andl sequence on X86).
define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}
702
; _mm512_testn_epi16_mask: and + icmp-eq-zero over 32 words folds to a single
; vptestnmw.
define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}
717
; _mm512_mask_testn_epi16_mask: vptestnmw under the i32 write mask %__U,
; applied as the {%k1} predicate on both targets.
define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}
743
; _mm512_cvtepi16_epi8: trunc <32 x i16> -> <32 x i8> selects the word-to-byte
; down-convert vpmovwb (512-bit source, 256-bit result).
define <4 x i64> @test_mm512_cvtepi16_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi16_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovwb %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <32 x i8> %conv.i to <4 x i64>
  ret <4 x i64> %1
}
755
; _mm512_mask_cvtepi16_epi8: vpmovwb merged into the pass-through %__O under
; the 32-bit mask %__M.
define <4 x i64> @test_mm512_mask_cvtepi16_epi8(<4 x i64> %__O, i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <4 x i64> %__O to <32 x i8>
  %2 = bitcast i32 %__M to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i8> %conv.i.i, <32 x i8> %1
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  ret <4 x i64> %4
}
777
; _mm512_maskz_cvtepi16_epi8: vpmovwb with zero-masking under the 32-bit mask
; %__M (select against zeroinitializer -> {z} form).
define <4 x i64> @test_mm512_maskz_cvtepi16_epi8(i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i8> %conv.i.i, <32 x i8> zeroinitializer
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}
798
; _mm512_mask2_permutex2var_epi16: two-source word permute where masked-off
; elements keep the INDEX operand %__I (the select falls back to %1), which is
; why codegen picks the index-overwriting vpermi2w form.
define <8 x i64> @test_mm512_mask2_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, i32 %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %1
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
823
; _mm512_permutex2var_epi16 (unmasked): the vpermi2var intrinsic selects the
; table-overwriting vpermt2w form when no operand must be preserved.
define <8 x i64> @test_mm512_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast <32 x i16> %3 to <8 x i64>
  ret <8 x i64> %4
}
837
; _mm512_mask_permutex2var_epi16: masked-off elements keep the FIRST source
; %__A (the select falls back to %0), so codegen picks masked vpermt2w.
define <8 x i64> @test_mm512_mask_permutex2var_epi16(<8 x i64> %__A, i32 %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %0
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
860
; _mm512_maskz_permutex2var_epi16: zero-masked variant (select against
; zeroinitializer) -> vpermt2w with {z}.
define <8 x i64> @test_mm512_maskz_permutex2var_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
883
; Declaration of the two-source word-permute intrinsic used by the
; permutex2var tests above.
declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
885
; NOTE(review): !0 is not referenced by anything in this chunk — presumably
; attached to an instruction/function earlier in the file; verify before removing.
!0 = !{i32 1}
887
888