xref: /llvm-project/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll (revision f0dd12ec5c0169ba5b4363b62d59511181cf954a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlvbmi2-builtins.c
6
7define <2 x i64> @test_mm_mask_compress_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__D) {
8; X86-LABEL: test_mm_mask_compress_epi16:
9; X86:       # %bb.0: # %entry
10; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
11; X86-NEXT:    kmovd %eax, %k1
12; X86-NEXT:    vpcompressw %xmm1, %xmm0 {%k1}
13; X86-NEXT:    retl
14;
15; X64-LABEL: test_mm_mask_compress_epi16:
16; X64:       # %bb.0: # %entry
17; X64-NEXT:    kmovd %edi, %k1
18; X64-NEXT:    vpcompressw %xmm1, %xmm0 {%k1}
19; X64-NEXT:    retq
20entry:
21  %0 = bitcast <2 x i64> %__D to <8 x i16>
22  %1 = bitcast <2 x i64> %__S to <8 x i16>
23  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %0, <8 x i16> %1, i8 %__U)
24  %3 = bitcast <8 x i16> %2 to <2 x i64>
25  ret <2 x i64> %3
26}
27
28define <2 x i64> @test_mm_maskz_compress_epi16(i8 zeroext %__U, <2 x i64> %__D) {
29; X86-LABEL: test_mm_maskz_compress_epi16:
30; X86:       # %bb.0: # %entry
31; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
32; X86-NEXT:    kmovd %eax, %k1
33; X86-NEXT:    vpcompressw %xmm0, %xmm0 {%k1} {z}
34; X86-NEXT:    retl
35;
36; X64-LABEL: test_mm_maskz_compress_epi16:
37; X64:       # %bb.0: # %entry
38; X64-NEXT:    kmovd %edi, %k1
39; X64-NEXT:    vpcompressw %xmm0, %xmm0 {%k1} {z}
40; X64-NEXT:    retq
41entry:
42  %0 = bitcast <2 x i64> %__D to <8 x i16>
43  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %0, <8 x i16> zeroinitializer, i8 %__U)
44  %2 = bitcast <8 x i16> %1 to <2 x i64>
45  ret <2 x i64> %2
46}
47
48define <2 x i64> @test_mm_mask_compress_epi8(<2 x i64> %__S, i16 zeroext %__U, <2 x i64> %__D) {
49; X86-LABEL: test_mm_mask_compress_epi8:
50; X86:       # %bb.0: # %entry
51; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
52; X86-NEXT:    vpcompressb %xmm1, %xmm0 {%k1}
53; X86-NEXT:    retl
54;
55; X64-LABEL: test_mm_mask_compress_epi8:
56; X64:       # %bb.0: # %entry
57; X64-NEXT:    kmovd %edi, %k1
58; X64-NEXT:    vpcompressb %xmm1, %xmm0 {%k1}
59; X64-NEXT:    retq
60entry:
61  %0 = bitcast <2 x i64> %__D to <16 x i8>
62  %1 = bitcast <2 x i64> %__S to <16 x i8>
63  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %0, <16 x i8> %1, i16 %__U)
64  %3 = bitcast <16 x i8> %2 to <2 x i64>
65  ret <2 x i64> %3
66}
67
68define <2 x i64> @test_mm_maskz_compress_epi8(i16 zeroext %__U, <2 x i64> %__D) {
69; X86-LABEL: test_mm_maskz_compress_epi8:
70; X86:       # %bb.0: # %entry
71; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
72; X86-NEXT:    vpcompressb %xmm0, %xmm0 {%k1} {z}
73; X86-NEXT:    retl
74;
75; X64-LABEL: test_mm_maskz_compress_epi8:
76; X64:       # %bb.0: # %entry
77; X64-NEXT:    kmovd %edi, %k1
78; X64-NEXT:    vpcompressb %xmm0, %xmm0 {%k1} {z}
79; X64-NEXT:    retq
80entry:
81  %0 = bitcast <2 x i64> %__D to <16 x i8>
82  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %0, <16 x i8> zeroinitializer, i16 %__U)
83  %2 = bitcast <16 x i8> %1 to <2 x i64>
84  ret <2 x i64> %2
85}
86
87define void @test_mm_mask_compressstoreu_epi16(ptr %__P, i8 zeroext %__U, <2 x i64> %__D) {
88; X86-LABEL: test_mm_mask_compressstoreu_epi16:
89; X86:       # %bb.0: # %entry
90; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
91; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
92; X86-NEXT:    kmovd %eax, %k1
93; X86-NEXT:    vpcompressw %xmm0, (%ecx) {%k1}
94; X86-NEXT:    retl
95;
96; X64-LABEL: test_mm_mask_compressstoreu_epi16:
97; X64:       # %bb.0: # %entry
98; X64-NEXT:    kmovd %esi, %k1
99; X64-NEXT:    vpcompressw %xmm0, (%rdi) {%k1}
100; X64-NEXT:    retq
101entry:
102  %0 = bitcast <2 x i64> %__D to <8 x i16>
103  %1 = bitcast i8 %__U to <8 x i1>
104  tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %0, ptr %__P, <8 x i1> %1)
105  ret void
106}
107
108define void @test_mm_mask_compressstoreu_epi8(ptr %__P, i16 zeroext %__U, <2 x i64> %__D) {
109; X86-LABEL: test_mm_mask_compressstoreu_epi8:
110; X86:       # %bb.0: # %entry
111; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
112; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
113; X86-NEXT:    vpcompressb %xmm0, (%eax) {%k1}
114; X86-NEXT:    retl
115;
116; X64-LABEL: test_mm_mask_compressstoreu_epi8:
117; X64:       # %bb.0: # %entry
118; X64-NEXT:    kmovd %esi, %k1
119; X64-NEXT:    vpcompressb %xmm0, (%rdi) {%k1}
120; X64-NEXT:    retq
121entry:
122  %0 = bitcast <2 x i64> %__D to <16 x i8>
123  %1 = bitcast i16 %__U to <16 x i1>
124  tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %0, ptr %__P, <16 x i1> %1)
125  ret void
126}
127
128define <2 x i64> @test_mm_mask_expand_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__D) {
129; X86-LABEL: test_mm_mask_expand_epi16:
130; X86:       # %bb.0: # %entry
131; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
132; X86-NEXT:    kmovd %eax, %k1
133; X86-NEXT:    vpexpandw %xmm1, %xmm0 {%k1}
134; X86-NEXT:    retl
135;
136; X64-LABEL: test_mm_mask_expand_epi16:
137; X64:       # %bb.0: # %entry
138; X64-NEXT:    kmovd %edi, %k1
139; X64-NEXT:    vpexpandw %xmm1, %xmm0 {%k1}
140; X64-NEXT:    retq
141entry:
142  %0 = bitcast <2 x i64> %__D to <8 x i16>
143  %1 = bitcast <2 x i64> %__S to <8 x i16>
144  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %0, <8 x i16> %1, i8 %__U)
145  %3 = bitcast <8 x i16> %2 to <2 x i64>
146  ret <2 x i64> %3
147}
148
149define <2 x i64> @test_mm_maskz_expand_epi16(i8 zeroext %__U, <2 x i64> %__D) {
150; X86-LABEL: test_mm_maskz_expand_epi16:
151; X86:       # %bb.0: # %entry
152; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
153; X86-NEXT:    kmovd %eax, %k1
154; X86-NEXT:    vpexpandw %xmm0, %xmm0 {%k1} {z}
155; X86-NEXT:    retl
156;
157; X64-LABEL: test_mm_maskz_expand_epi16:
158; X64:       # %bb.0: # %entry
159; X64-NEXT:    kmovd %edi, %k1
160; X64-NEXT:    vpexpandw %xmm0, %xmm0 {%k1} {z}
161; X64-NEXT:    retq
162entry:
163  %0 = bitcast <2 x i64> %__D to <8 x i16>
164  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %0, <8 x i16> zeroinitializer, i8 %__U)
165  %2 = bitcast <8 x i16> %1 to <2 x i64>
166  ret <2 x i64> %2
167}
168
169define <2 x i64> @test_mm_mask_expand_epi8(<2 x i64> %__S, i16 zeroext %__U, <2 x i64> %__D) {
170; X86-LABEL: test_mm_mask_expand_epi8:
171; X86:       # %bb.0: # %entry
172; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
173; X86-NEXT:    vpexpandb %xmm1, %xmm0 {%k1}
174; X86-NEXT:    retl
175;
176; X64-LABEL: test_mm_mask_expand_epi8:
177; X64:       # %bb.0: # %entry
178; X64-NEXT:    kmovd %edi, %k1
179; X64-NEXT:    vpexpandb %xmm1, %xmm0 {%k1}
180; X64-NEXT:    retq
181entry:
182  %0 = bitcast <2 x i64> %__D to <16 x i8>
183  %1 = bitcast <2 x i64> %__S to <16 x i8>
184  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %0, <16 x i8> %1, i16 %__U)
185  %3 = bitcast <16 x i8> %2 to <2 x i64>
186  ret <2 x i64> %3
187}
188
189define <2 x i64> @test_mm_maskz_expand_epi8(i16 zeroext %__U, <2 x i64> %__D) {
190; X86-LABEL: test_mm_maskz_expand_epi8:
191; X86:       # %bb.0: # %entry
192; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
193; X86-NEXT:    vpexpandb %xmm0, %xmm0 {%k1} {z}
194; X86-NEXT:    retl
195;
196; X64-LABEL: test_mm_maskz_expand_epi8:
197; X64:       # %bb.0: # %entry
198; X64-NEXT:    kmovd %edi, %k1
199; X64-NEXT:    vpexpandb %xmm0, %xmm0 {%k1} {z}
200; X64-NEXT:    retq
201entry:
202  %0 = bitcast <2 x i64> %__D to <16 x i8>
203  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %0, <16 x i8> zeroinitializer, i16 %__U)
204  %2 = bitcast <16 x i8> %1 to <2 x i64>
205  ret <2 x i64> %2
206}
207
208define <2 x i64> @test_mm_mask_expandloadu_epi16(<2 x i64> %__S, i8 zeroext %__U, ptr readonly %__P) {
209; X86-LABEL: test_mm_mask_expandloadu_epi16:
210; X86:       # %bb.0: # %entry
211; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
212; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
213; X86-NEXT:    kmovd %ecx, %k1
214; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1}
215; X86-NEXT:    retl
216;
217; X64-LABEL: test_mm_mask_expandloadu_epi16:
218; X64:       # %bb.0: # %entry
219; X64-NEXT:    kmovd %edi, %k1
220; X64-NEXT:    vpexpandw (%rsi), %xmm0 {%k1}
221; X64-NEXT:    retq
222entry:
223  %0 = bitcast <2 x i64> %__S to <8 x i16>
224  %1 = bitcast i8 %__U to <8 x i1>
225  %2 = tail call <8 x i16> @llvm.masked.expandload.v8i16(ptr %__P, <8 x i1> %1, <8 x i16> %0)
226  %3 = bitcast <8 x i16> %2 to <2 x i64>
227  ret <2 x i64> %3
228}
229
230define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, ptr readonly %__P) {
231; X86-LABEL: test_mm_maskz_expandloadu_epi16:
232; X86:       # %bb.0: # %entry
233; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
234; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
235; X86-NEXT:    kmovd %ecx, %k1
236; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} {z}
237; X86-NEXT:    retl
238;
239; X64-LABEL: test_mm_maskz_expandloadu_epi16:
240; X64:       # %bb.0: # %entry
241; X64-NEXT:    kmovd %edi, %k1
242; X64-NEXT:    vpexpandw (%rsi), %xmm0 {%k1} {z}
243; X64-NEXT:    retq
244entry:
245  %0 = bitcast i8 %__U to <8 x i1>
246  %1 = tail call <8 x i16> @llvm.masked.expandload.v8i16(ptr %__P, <8 x i1> %0, <8 x i16> zeroinitializer)
247  %2 = bitcast <8 x i16> %1 to <2 x i64>
248  ret <2 x i64> %2
249}
250
251define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U, ptr readonly %__P) {
252; X86-LABEL: test_mm_mask_expandloadu_epi8:
253; X86:       # %bb.0: # %entry
254; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
255; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
256; X86-NEXT:    vpexpandb (%eax), %xmm0 {%k1}
257; X86-NEXT:    retl
258;
259; X64-LABEL: test_mm_mask_expandloadu_epi8:
260; X64:       # %bb.0: # %entry
261; X64-NEXT:    kmovd %edi, %k1
262; X64-NEXT:    vpexpandb (%rsi), %xmm0 {%k1}
263; X64-NEXT:    retq
264entry:
265  %0 = bitcast <2 x i64> %__S to <16 x i8>
266  %1 = bitcast i16 %__U to <16 x i1>
267  %2 = tail call <16 x i8> @llvm.masked.expandload.v16i8(ptr %__P, <16 x i1> %1, <16 x i8> %0)
268  %3 = bitcast <16 x i8> %2 to <2 x i64>
269  ret <2 x i64> %3
270}
271
272define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, ptr readonly %__P) {
273; X86-LABEL: test_mm_maskz_expandloadu_epi8:
274; X86:       # %bb.0: # %entry
275; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
276; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
277; X86-NEXT:    vpexpandb (%eax), %xmm0 {%k1} {z}
278; X86-NEXT:    retl
279;
280; X64-LABEL: test_mm_maskz_expandloadu_epi8:
281; X64:       # %bb.0: # %entry
282; X64-NEXT:    kmovd %edi, %k1
283; X64-NEXT:    vpexpandb (%rsi), %xmm0 {%k1} {z}
284; X64-NEXT:    retq
285entry:
286  %0 = bitcast i16 %__U to <16 x i1>
287  %1 = tail call <16 x i8> @llvm.masked.expandload.v16i8(ptr %__P, <16 x i1> %0, <16 x i8> zeroinitializer)
288  %2 = bitcast <16 x i8> %1 to <2 x i64>
289  ret <2 x i64> %2
290}
291
292define <4 x i64> @test_mm256_mask_compress_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
293; X86-LABEL: test_mm256_mask_compress_epi16:
294; X86:       # %bb.0: # %entry
295; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
296; X86-NEXT:    vpcompressw %ymm1, %ymm0 {%k1}
297; X86-NEXT:    retl
298;
299; X64-LABEL: test_mm256_mask_compress_epi16:
300; X64:       # %bb.0: # %entry
301; X64-NEXT:    kmovd %edi, %k1
302; X64-NEXT:    vpcompressw %ymm1, %ymm0 {%k1}
303; X64-NEXT:    retq
304entry:
305  %0 = bitcast <4 x i64> %__D to <16 x i16>
306  %1 = bitcast <4 x i64> %__S to <16 x i16>
307  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %0, <16 x i16> %1, i16 %__U)
308  %3 = bitcast <16 x i16> %2 to <4 x i64>
309  ret <4 x i64> %3
310}
311
312define <4 x i64> @test_mm256_maskz_compress_epi16(i16 zeroext %__U, <4 x i64> %__D) {
313; X86-LABEL: test_mm256_maskz_compress_epi16:
314; X86:       # %bb.0: # %entry
315; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
316; X86-NEXT:    vpcompressw %ymm0, %ymm0 {%k1} {z}
317; X86-NEXT:    retl
318;
319; X64-LABEL: test_mm256_maskz_compress_epi16:
320; X64:       # %bb.0: # %entry
321; X64-NEXT:    kmovd %edi, %k1
322; X64-NEXT:    vpcompressw %ymm0, %ymm0 {%k1} {z}
323; X64-NEXT:    retq
324entry:
325  %0 = bitcast <4 x i64> %__D to <16 x i16>
326  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %0, <16 x i16> zeroinitializer, i16 %__U)
327  %2 = bitcast <16 x i16> %1 to <4 x i64>
328  ret <4 x i64> %2
329}
330
331define <4 x i64> @test_mm256_mask_compress_epi8(<4 x i64> %__S, i32 %__U, <4 x i64> %__D) {
332; X86-LABEL: test_mm256_mask_compress_epi8:
333; X86:       # %bb.0: # %entry
334; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
335; X86-NEXT:    vpcompressb %ymm1, %ymm0 {%k1}
336; X86-NEXT:    retl
337;
338; X64-LABEL: test_mm256_mask_compress_epi8:
339; X64:       # %bb.0: # %entry
340; X64-NEXT:    kmovd %edi, %k1
341; X64-NEXT:    vpcompressb %ymm1, %ymm0 {%k1}
342; X64-NEXT:    retq
343entry:
344  %0 = bitcast <4 x i64> %__D to <32 x i8>
345  %1 = bitcast <4 x i64> %__S to <32 x i8>
346  %2 = tail call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %0, <32 x i8> %1, i32 %__U)
347  %3 = bitcast <32 x i8> %2 to <4 x i64>
348  ret <4 x i64> %3
349}
350
351define <4 x i64> @test_mm256_maskz_compress_epi8(i32 %__U, <4 x i64> %__D) {
352; X86-LABEL: test_mm256_maskz_compress_epi8:
353; X86:       # %bb.0: # %entry
354; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
355; X86-NEXT:    vpcompressb %ymm0, %ymm0 {%k1} {z}
356; X86-NEXT:    retl
357;
358; X64-LABEL: test_mm256_maskz_compress_epi8:
359; X64:       # %bb.0: # %entry
360; X64-NEXT:    kmovd %edi, %k1
361; X64-NEXT:    vpcompressb %ymm0, %ymm0 {%k1} {z}
362; X64-NEXT:    retq
363entry:
364  %0 = bitcast <4 x i64> %__D to <32 x i8>
365  %1 = tail call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %0, <32 x i8> zeroinitializer, i32 %__U)
366  %2 = bitcast <32 x i8> %1 to <4 x i64>
367  ret <4 x i64> %2
368}
369
370define void @test_mm256_mask_compressstoreu_epi16(ptr %__P, i16 zeroext %__U, <4 x i64> %__D) {
371; X86-LABEL: test_mm256_mask_compressstoreu_epi16:
372; X86:       # %bb.0: # %entry
373; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
374; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
375; X86-NEXT:    vpcompressw %ymm0, (%eax) {%k1}
376; X86-NEXT:    vzeroupper
377; X86-NEXT:    retl
378;
379; X64-LABEL: test_mm256_mask_compressstoreu_epi16:
380; X64:       # %bb.0: # %entry
381; X64-NEXT:    kmovd %esi, %k1
382; X64-NEXT:    vpcompressw %ymm0, (%rdi) {%k1}
383; X64-NEXT:    vzeroupper
384; X64-NEXT:    retq
385entry:
386  %0 = bitcast <4 x i64> %__D to <16 x i16>
387  %1 = bitcast i16 %__U to <16 x i1>
388  tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %0, ptr %__P, <16 x i1> %1)
389  ret void
390}
391
392define void @test_mm256_mask_compressstoreu_epi8(ptr %__P, i32 %__U, <4 x i64> %__D) {
393; X86-LABEL: test_mm256_mask_compressstoreu_epi8:
394; X86:       # %bb.0: # %entry
395; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
396; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
397; X86-NEXT:    vpcompressb %ymm0, (%eax) {%k1}
398; X86-NEXT:    vzeroupper
399; X86-NEXT:    retl
400;
401; X64-LABEL: test_mm256_mask_compressstoreu_epi8:
402; X64:       # %bb.0: # %entry
403; X64-NEXT:    kmovd %esi, %k1
404; X64-NEXT:    vpcompressb %ymm0, (%rdi) {%k1}
405; X64-NEXT:    vzeroupper
406; X64-NEXT:    retq
407entry:
408  %0 = bitcast <4 x i64> %__D to <32 x i8>
409  %1 = bitcast i32 %__U to <32 x i1>
410  tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %0, ptr %__P, <32 x i1> %1)
411  ret void
412}
413
414define <4 x i64> @test_mm256_mask_expand_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
415; X86-LABEL: test_mm256_mask_expand_epi16:
416; X86:       # %bb.0: # %entry
417; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
418; X86-NEXT:    vpexpandw %ymm1, %ymm0 {%k1}
419; X86-NEXT:    retl
420;
421; X64-LABEL: test_mm256_mask_expand_epi16:
422; X64:       # %bb.0: # %entry
423; X64-NEXT:    kmovd %edi, %k1
424; X64-NEXT:    vpexpandw %ymm1, %ymm0 {%k1}
425; X64-NEXT:    retq
426entry:
427  %0 = bitcast <4 x i64> %__D to <16 x i16>
428  %1 = bitcast <4 x i64> %__S to <16 x i16>
429  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %0, <16 x i16> %1, i16 %__U)
430  %3 = bitcast <16 x i16> %2 to <4 x i64>
431  ret <4 x i64> %3
432}
433
434define <4 x i64> @test_mm256_maskz_expand_epi16(i16 zeroext %__U, <4 x i64> %__D) {
435; X86-LABEL: test_mm256_maskz_expand_epi16:
436; X86:       # %bb.0: # %entry
437; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
438; X86-NEXT:    vpexpandw %ymm0, %ymm0 {%k1} {z}
439; X86-NEXT:    retl
440;
441; X64-LABEL: test_mm256_maskz_expand_epi16:
442; X64:       # %bb.0: # %entry
443; X64-NEXT:    kmovd %edi, %k1
444; X64-NEXT:    vpexpandw %ymm0, %ymm0 {%k1} {z}
445; X64-NEXT:    retq
446entry:
447  %0 = bitcast <4 x i64> %__D to <16 x i16>
448  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %0, <16 x i16> zeroinitializer, i16 %__U)
449  %2 = bitcast <16 x i16> %1 to <4 x i64>
450  ret <4 x i64> %2
451}
452
453define <4 x i64> @test_mm256_mask_expand_epi8(<4 x i64> %__S, i32 %__U, <4 x i64> %__D) {
454; X86-LABEL: test_mm256_mask_expand_epi8:
455; X86:       # %bb.0: # %entry
456; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
457; X86-NEXT:    vpexpandb %ymm1, %ymm0 {%k1}
458; X86-NEXT:    retl
459;
460; X64-LABEL: test_mm256_mask_expand_epi8:
461; X64:       # %bb.0: # %entry
462; X64-NEXT:    kmovd %edi, %k1
463; X64-NEXT:    vpexpandb %ymm1, %ymm0 {%k1}
464; X64-NEXT:    retq
465entry:
466  %0 = bitcast <4 x i64> %__D to <32 x i8>
467  %1 = bitcast <4 x i64> %__S to <32 x i8>
468  %2 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %0, <32 x i8> %1, i32 %__U)
469  %3 = bitcast <32 x i8> %2 to <4 x i64>
470  ret <4 x i64> %3
471}
472
473define <4 x i64> @test_mm256_maskz_expand_epi8(i32 %__U, <4 x i64> %__D) {
474; X86-LABEL: test_mm256_maskz_expand_epi8:
475; X86:       # %bb.0: # %entry
476; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
477; X86-NEXT:    vpexpandb %ymm0, %ymm0 {%k1} {z}
478; X86-NEXT:    retl
479;
480; X64-LABEL: test_mm256_maskz_expand_epi8:
481; X64:       # %bb.0: # %entry
482; X64-NEXT:    kmovd %edi, %k1
483; X64-NEXT:    vpexpandb %ymm0, %ymm0 {%k1} {z}
484; X64-NEXT:    retq
485entry:
486  %0 = bitcast <4 x i64> %__D to <32 x i8>
487  %1 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %0, <32 x i8> zeroinitializer, i32 %__U)
488  %2 = bitcast <32 x i8> %1 to <4 x i64>
489  ret <4 x i64> %2
490}
491
492define <4 x i64> @test_mm256_mask_expandloadu_epi16(<4 x i64> %__S, i16 zeroext %__U, ptr readonly %__P) {
493; X86-LABEL: test_mm256_mask_expandloadu_epi16:
494; X86:       # %bb.0: # %entry
495; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
496; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
497; X86-NEXT:    vpexpandw (%eax), %ymm0 {%k1}
498; X86-NEXT:    retl
499;
500; X64-LABEL: test_mm256_mask_expandloadu_epi16:
501; X64:       # %bb.0: # %entry
502; X64-NEXT:    kmovd %edi, %k1
503; X64-NEXT:    vpexpandw (%rsi), %ymm0 {%k1}
504; X64-NEXT:    retq
505entry:
506  %0 = bitcast <4 x i64> %__S to <16 x i16>
507  %1 = bitcast i16 %__U to <16 x i1>
508  %2 = tail call <16 x i16> @llvm.masked.expandload.v16i16(ptr %__P, <16 x i1> %1, <16 x i16> %0)
509  %3 = bitcast <16 x i16> %2 to <4 x i64>
510  ret <4 x i64> %3
511}
512
513define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, ptr readonly %__P) {
514; X86-LABEL: test_mm256_maskz_expandloadu_epi16:
515; X86:       # %bb.0: # %entry
516; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
517; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
518; X86-NEXT:    vpexpandw (%eax), %ymm0 {%k1} {z}
519; X86-NEXT:    retl
520;
521; X64-LABEL: test_mm256_maskz_expandloadu_epi16:
522; X64:       # %bb.0: # %entry
523; X64-NEXT:    kmovd %edi, %k1
524; X64-NEXT:    vpexpandw (%rsi), %ymm0 {%k1} {z}
525; X64-NEXT:    retq
526entry:
527  %0 = bitcast i16 %__U to <16 x i1>
528  %1 = tail call <16 x i16> @llvm.masked.expandload.v16i16(ptr %__P, <16 x i1> %0, <16 x i16> zeroinitializer)
529  %2 = bitcast <16 x i16> %1 to <4 x i64>
530  ret <4 x i64> %2
531}
532
533define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, ptr readonly %__P) {
534; X86-LABEL: test_mm256_mask_expandloadu_epi8:
535; X86:       # %bb.0: # %entry
536; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
537; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
538; X86-NEXT:    vpexpandb (%eax), %ymm0 {%k1}
539; X86-NEXT:    retl
540;
541; X64-LABEL: test_mm256_mask_expandloadu_epi8:
542; X64:       # %bb.0: # %entry
543; X64-NEXT:    kmovd %edi, %k1
544; X64-NEXT:    vpexpandb (%rsi), %ymm0 {%k1}
545; X64-NEXT:    retq
546entry:
547  %0 = bitcast <4 x i64> %__S to <32 x i8>
548  %1 = bitcast i32 %__U to <32 x i1>
549  %2 = tail call <32 x i8> @llvm.masked.expandload.v32i8(ptr %__P, <32 x i1> %1, <32 x i8> %0)
550  %3 = bitcast <32 x i8> %2 to <4 x i64>
551  ret <4 x i64> %3
552}
553
554define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, ptr readonly %__P) {
555; X86-LABEL: test_mm256_maskz_expandloadu_epi8:
556; X86:       # %bb.0: # %entry
557; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
558; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
559; X86-NEXT:    vpexpandb (%eax), %ymm0 {%k1} {z}
560; X86-NEXT:    retl
561;
562; X64-LABEL: test_mm256_maskz_expandloadu_epi8:
563; X64:       # %bb.0: # %entry
564; X64-NEXT:    kmovd %edi, %k1
565; X64-NEXT:    vpexpandb (%rsi), %ymm0 {%k1} {z}
566; X64-NEXT:    retq
567entry:
568  %0 = bitcast i32 %__U to <32 x i1>
569  %1 = tail call <32 x i8> @llvm.masked.expandload.v32i8(ptr %__P, <32 x i1> %0, <32 x i8> zeroinitializer)
570  %2 = bitcast <32 x i8> %1 to <4 x i64>
571  ret <4 x i64> %2
572}
573
574define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
575; X86-LABEL: test_mm256_mask_shldi_epi64:
576; X86:       # %bb.0: # %entry
577; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
578; X86-NEXT:    kmovd %eax, %k1
579; X86-NEXT:    vpshldq $47, %ymm2, %ymm1, %ymm0 {%k1}
580; X86-NEXT:    retl
581;
582; X64-LABEL: test_mm256_mask_shldi_epi64:
583; X64:       # %bb.0: # %entry
584; X64-NEXT:    kmovd %edi, %k1
585; X64-NEXT:    vpshldq $47, %ymm2, %ymm1, %ymm0 {%k1}
586; X64-NEXT:    retq
587entry:
588  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 47, i64 47, i64 47, i64 47>)
589  %1 = bitcast i8 %__U to <8 x i1>
590  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
591  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
592  ret <4 x i64> %2
593}
594
595declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
596
597define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
598; X86-LABEL: test_mm256_maskz_shldi_epi64:
599; X86:       # %bb.0: # %entry
600; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
601; X86-NEXT:    kmovd %eax, %k1
602; X86-NEXT:    vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
603; X86-NEXT:    retl
604;
605; X64-LABEL: test_mm256_maskz_shldi_epi64:
606; X64:       # %bb.0: # %entry
607; X64-NEXT:    kmovd %edi, %k1
608; X64-NEXT:    vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
609; X64-NEXT:    retq
610entry:
611  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
612  %1 = bitcast i8 %__U to <8 x i1>
613  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
614  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
615  ret <4 x i64> %2
616}
617
618define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
619; CHECK-LABEL: test_mm256_shldi_epi64:
620; CHECK:       # %bb.0: # %entry
621; CHECK-NEXT:    vpshldq $31, %ymm1, %ymm0, %ymm0
622; CHECK-NEXT:    ret{{[l|q]}}
623entry:
624  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
625  ret <4 x i64> %0
626}
627
628define <2 x i64> @test_mm_mask_shldi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
629; X86-LABEL: test_mm_mask_shldi_epi64:
630; X86:       # %bb.0: # %entry
631; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
632; X86-NEXT:    kmovd %eax, %k1
633; X86-NEXT:    vpshldq $47, %xmm2, %xmm1, %xmm0 {%k1}
634; X86-NEXT:    retl
635;
636; X64-LABEL: test_mm_mask_shldi_epi64:
637; X64:       # %bb.0: # %entry
638; X64-NEXT:    kmovd %edi, %k1
639; X64-NEXT:    vpshldq $47, %xmm2, %xmm1, %xmm0 {%k1}
640; X64-NEXT:    retq
641entry:
642  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 47, i64 47>)
643  %1 = bitcast i8 %__U to <8 x i1>
644  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
645  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
646  ret <2 x i64> %2
647}
648
649declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
650
651define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
652; X86-LABEL: test_mm_maskz_shldi_epi64:
653; X86:       # %bb.0: # %entry
654; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
655; X86-NEXT:    kmovd %eax, %k1
656; X86-NEXT:    vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
657; X86-NEXT:    retl
658;
659; X64-LABEL: test_mm_maskz_shldi_epi64:
660; X64:       # %bb.0: # %entry
661; X64-NEXT:    kmovd %edi, %k1
662; X64-NEXT:    vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
663; X64-NEXT:    retq
664entry:
665  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 63, i64 63>)
666  %1 = bitcast i8 %__U to <8 x i1>
667  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
668  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
669  ret <2 x i64> %2
670}
671
672define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
673; CHECK-LABEL: test_mm_shldi_epi64:
674; CHECK:       # %bb.0: # %entry
675; CHECK-NEXT:    vpshldq $31, %xmm1, %xmm0, %xmm0
676; CHECK-NEXT:    ret{{[l|q]}}
677entry:
678  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 31, i64 31>)
679  ret <2 x i64> %0
680}
681
682define <4 x i64> @test_mm256_mask_shldi_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
683; X86-LABEL: test_mm256_mask_shldi_epi32:
684; X86:       # %bb.0: # %entry
685; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
686; X86-NEXT:    kmovd %eax, %k1
687; X86-NEXT:    vpshldd $7, %ymm2, %ymm1, %ymm0 {%k1}
688; X86-NEXT:    retl
689;
690; X64-LABEL: test_mm256_mask_shldi_epi32:
691; X64:       # %bb.0: # %entry
692; X64-NEXT:    kmovd %edi, %k1
693; X64-NEXT:    vpshldd $7, %ymm2, %ymm1, %ymm0 {%k1}
694; X64-NEXT:    retq
695entry:
696  %0 = bitcast <4 x i64> %__A to <8 x i32>
697  %1 = bitcast <4 x i64> %__B to <8 x i32>
698  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>)
699  %3 = bitcast <4 x i64> %__S to <8 x i32>
700  %4 = bitcast i8 %__U to <8 x i1>
701  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
702  %6 = bitcast <8 x i32> %5 to <4 x i64>
703  ret <4 x i64> %6
704}
705
706declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
707
708define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
709; X86-LABEL: test_mm256_maskz_shldi_epi32:
710; X86:       # %bb.0: # %entry
711; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
712; X86-NEXT:    kmovd %eax, %k1
713; X86-NEXT:    vpshldd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
714; X86-NEXT:    retl
715;
716; X64-LABEL: test_mm256_maskz_shldi_epi32:
717; X64:       # %bb.0: # %entry
718; X64-NEXT:    kmovd %edi, %k1
719; X64-NEXT:    vpshldd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
720; X64-NEXT:    retq
721entry:
722  %0 = bitcast <4 x i64> %__A to <8 x i32>
723  %1 = bitcast <4 x i64> %__B to <8 x i32>
724  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>)
725  %3 = bitcast i8 %__U to <8 x i1>
726  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
727  %5 = bitcast <8 x i32> %4 to <4 x i64>
728  ret <4 x i64> %5
729}
730
731define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
732; CHECK-LABEL: test_mm256_shldi_epi32:
733; CHECK:       # %bb.0: # %entry
734; CHECK-NEXT:    vpshldd $31, %ymm1, %ymm0, %ymm0
735; CHECK-NEXT:    ret{{[l|q]}}
736entry:
737  %0 = bitcast <4 x i64> %__A to <8 x i32>
738  %1 = bitcast <4 x i64> %__B to <8 x i32>
739  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>)
740  %3 = bitcast <8 x i32> %2 to <4 x i64>
741  ret <4 x i64> %3
742}
743
744define <2 x i64> @test_mm_mask_shldi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
745; X86-LABEL: test_mm_mask_shldi_epi32:
746; X86:       # %bb.0: # %entry
747; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
748; X86-NEXT:    kmovd %eax, %k1
749; X86-NEXT:    vpshldd $7, %xmm2, %xmm1, %xmm0 {%k1}
750; X86-NEXT:    retl
751;
752; X64-LABEL: test_mm_mask_shldi_epi32:
753; X64:       # %bb.0: # %entry
754; X64-NEXT:    kmovd %edi, %k1
755; X64-NEXT:    vpshldd $7, %xmm2, %xmm1, %xmm0 {%k1}
756; X64-NEXT:    retq
757entry:
758  %0 = bitcast <2 x i64> %__A to <4 x i32>
759  %1 = bitcast <2 x i64> %__B to <4 x i32>
760  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
761  %3 = bitcast <2 x i64> %__S to <4 x i32>
762  %4 = bitcast i8 %__U to <8 x i1>
763  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
764  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
765  %6 = bitcast <4 x i32> %5 to <2 x i64>
766  ret <2 x i64> %6
767}
768
769declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
770
771define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
772; X86-LABEL: test_mm_maskz_shldi_epi32:
773; X86:       # %bb.0: # %entry
774; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
775; X86-NEXT:    kmovd %eax, %k1
776; X86-NEXT:    vpshldd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
777; X86-NEXT:    retl
778;
779; X64-LABEL: test_mm_maskz_shldi_epi32:
780; X64:       # %bb.0: # %entry
781; X64-NEXT:    kmovd %edi, %k1
782; X64-NEXT:    vpshldd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
783; X64-NEXT:    retq
784entry:
785  %0 = bitcast <2 x i64> %__A to <4 x i32>
786  %1 = bitcast <2 x i64> %__B to <4 x i32>
787  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
788  %3 = bitcast i8 %__U to <8 x i1>
789  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
790  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
791  %5 = bitcast <4 x i32> %4 to <2 x i64>
792  ret <2 x i64> %5
793}
794
795define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
796; CHECK-LABEL: test_mm_shldi_epi32:
797; CHECK:       # %bb.0: # %entry
798; CHECK-NEXT:    vpshldd $31, %xmm1, %xmm0, %xmm0
799; CHECK-NEXT:    ret{{[l|q]}}
800entry:
801  %0 = bitcast <2 x i64> %__A to <4 x i32>
802  %1 = bitcast <2 x i64> %__B to <4 x i32>
803  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
804  %3 = bitcast <4 x i32> %2 to <2 x i64>
805  ret <2 x i64> %3
806}
807
808define <4 x i64> @test_mm256_mask_shldi_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
809; X86-LABEL: test_mm256_mask_shldi_epi16:
810; X86:       # %bb.0: # %entry
811; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
812; X86-NEXT:    vpshldw $3, %ymm2, %ymm1, %ymm0 {%k1}
813; X86-NEXT:    retl
814;
815; X64-LABEL: test_mm256_mask_shldi_epi16:
816; X64:       # %bb.0: # %entry
817; X64-NEXT:    kmovd %edi, %k1
818; X64-NEXT:    vpshldw $3, %ymm2, %ymm1, %ymm0 {%k1}
819; X64-NEXT:    retq
820entry:
821  %0 = bitcast <4 x i64> %__A to <16 x i16>
822  %1 = bitcast <4 x i64> %__B to <16 x i16>
823  %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
824  %3 = bitcast <4 x i64> %__S to <16 x i16>
825  %4 = bitcast i16 %__U to <16 x i1>
826  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
827  %6 = bitcast <16 x i16> %5 to <4 x i64>
828  ret <4 x i64> %6
829}
830
831declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
832
833define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
834; X86-LABEL: test_mm256_maskz_shldi_epi16:
835; X86:       # %bb.0: # %entry
836; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
837; X86-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
838; X86-NEXT:    retl
839;
840; X64-LABEL: test_mm256_maskz_shldi_epi16:
841; X64:       # %bb.0: # %entry
842; X64-NEXT:    kmovd %edi, %k1
843; X64-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
844; X64-NEXT:    retq
845entry:
846  %0 = bitcast <4 x i64> %__A to <16 x i16>
847  %1 = bitcast <4 x i64> %__B to <16 x i16>
848  %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
849  %3 = bitcast i16 %__U to <16 x i1>
850  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
851  %5 = bitcast <16 x i16> %4 to <4 x i64>
852  ret <4 x i64> %5
853}
854
855define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
856; CHECK-LABEL: test_mm256_shldi_epi16:
857; CHECK:       # %bb.0: # %entry
858; CHECK-NEXT:    vpshldw $15, %ymm1, %ymm0, %ymm0
859; CHECK-NEXT:    ret{{[l|q]}}
860entry:
861  %0 = bitcast <4 x i64> %__A to <16 x i16>
862  %1 = bitcast <4 x i64> %__B to <16 x i16>
863  %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
864  %3 = bitcast <16 x i16> %2 to <4 x i64>
865  ret <4 x i64> %3
866}
867
868define <2 x i64> @test_mm_mask_shldi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
869; X86-LABEL: test_mm_mask_shldi_epi16:
870; X86:       # %bb.0: # %entry
871; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
872; X86-NEXT:    kmovd %eax, %k1
873; X86-NEXT:    vpshldw $3, %xmm2, %xmm1, %xmm0 {%k1}
874; X86-NEXT:    retl
875;
876; X64-LABEL: test_mm_mask_shldi_epi16:
877; X64:       # %bb.0: # %entry
878; X64-NEXT:    kmovd %edi, %k1
879; X64-NEXT:    vpshldw $3, %xmm2, %xmm1, %xmm0 {%k1}
880; X64-NEXT:    retq
881entry:
882  %0 = bitcast <2 x i64> %__A to <8 x i16>
883  %1 = bitcast <2 x i64> %__B to <8 x i16>
884  %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
885  %3 = bitcast <2 x i64> %__S to <8 x i16>
886  %4 = bitcast i8 %__U to <8 x i1>
887  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
888  %6 = bitcast <8 x i16> %5 to <2 x i64>
889  ret <2 x i64> %6
890}
891
892declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
893
894define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
895; X86-LABEL: test_mm_maskz_shldi_epi16:
896; X86:       # %bb.0: # %entry
897; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
898; X86-NEXT:    kmovd %eax, %k1
899; X86-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
900; X86-NEXT:    retl
901;
902; X64-LABEL: test_mm_maskz_shldi_epi16:
903; X64:       # %bb.0: # %entry
904; X64-NEXT:    kmovd %edi, %k1
905; X64-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
906; X64-NEXT:    retq
907entry:
908  %0 = bitcast <2 x i64> %__A to <8 x i16>
909  %1 = bitcast <2 x i64> %__B to <8 x i16>
910  %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
911  %3 = bitcast i8 %__U to <8 x i1>
912  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
913  %5 = bitcast <8 x i16> %4 to <2 x i64>
914  ret <2 x i64> %5
915}
916
917define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
918; CHECK-LABEL: test_mm_shldi_epi16:
919; CHECK:       # %bb.0: # %entry
920; CHECK-NEXT:    vpshldw $15, %xmm1, %xmm0, %xmm0
921; CHECK-NEXT:    ret{{[l|q]}}
922entry:
923  %0 = bitcast <2 x i64> %__A to <8 x i16>
924  %1 = bitcast <2 x i64> %__B to <8 x i16>
925  %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
926  %3 = bitcast <8 x i16> %2 to <2 x i64>
927  ret <2 x i64> %3
928}
929
930define <4 x i64> @test_mm256_mask_shrdi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
931; X86-LABEL: test_mm256_mask_shrdi_epi64:
932; X86:       # %bb.0: # %entry
933; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
934; X86-NEXT:    kmovd %eax, %k1
935; X86-NEXT:    vpshrdq $47, %ymm2, %ymm1, %ymm0 {%k1}
936; X86-NEXT:    retl
937;
938; X64-LABEL: test_mm256_mask_shrdi_epi64:
939; X64:       # %bb.0: # %entry
940; X64-NEXT:    kmovd %edi, %k1
941; X64-NEXT:    vpshrdq $47, %ymm2, %ymm1, %ymm0 {%k1}
942; X64-NEXT:    retq
943entry:
944  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 47, i64 47, i64 47, i64 47>)
945  %1 = bitcast i8 %__U to <8 x i1>
946  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
947  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
948  ret <4 x i64> %2
949}
950
951declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
952
953define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
954; X86-LABEL: test_mm256_maskz_shrdi_epi64:
955; X86:       # %bb.0: # %entry
956; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
957; X86-NEXT:    kmovd %eax, %k1
958; X86-NEXT:    vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
959; X86-NEXT:    retl
960;
961; X64-LABEL: test_mm256_maskz_shrdi_epi64:
962; X64:       # %bb.0: # %entry
963; X64-NEXT:    kmovd %edi, %k1
964; X64-NEXT:    vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
965; X64-NEXT:    retq
966entry:
967  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
968  %1 = bitcast i8 %__U to <8 x i1>
969  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
970  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
971  ret <4 x i64> %2
972}
973
974define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
975; CHECK-LABEL: test_mm256_shrdi_epi64:
976; CHECK:       # %bb.0: # %entry
977; CHECK-NEXT:    vpshrdq $31, %ymm1, %ymm0, %ymm0
978; CHECK-NEXT:    ret{{[l|q]}}
979entry:
980  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
981  ret <4 x i64> %0
982}
983
984define <2 x i64> @test_mm_mask_shrdi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
985; X86-LABEL: test_mm_mask_shrdi_epi64:
986; X86:       # %bb.0: # %entry
987; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
988; X86-NEXT:    kmovd %eax, %k1
989; X86-NEXT:    vpshrdq $47, %xmm2, %xmm1, %xmm0 {%k1}
990; X86-NEXT:    retl
991;
992; X64-LABEL: test_mm_mask_shrdi_epi64:
993; X64:       # %bb.0: # %entry
994; X64-NEXT:    kmovd %edi, %k1
995; X64-NEXT:    vpshrdq $47, %xmm2, %xmm1, %xmm0 {%k1}
996; X64-NEXT:    retq
997entry:
998  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 47, i64 47>)
999  %1 = bitcast i8 %__U to <8 x i1>
1000  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1001  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
1002  ret <2 x i64> %2
1003}
1004
1005declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
1006
1007define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1008; X86-LABEL: test_mm_maskz_shrdi_epi64:
1009; X86:       # %bb.0: # %entry
1010; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1011; X86-NEXT:    kmovd %eax, %k1
1012; X86-NEXT:    vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
1013; X86-NEXT:    retl
1014;
1015; X64-LABEL: test_mm_maskz_shrdi_epi64:
1016; X64:       # %bb.0: # %entry
1017; X64-NEXT:    kmovd %edi, %k1
1018; X64-NEXT:    vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
1019; X64-NEXT:    retq
1020entry:
1021  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 63, i64 63>)
1022  %1 = bitcast i8 %__U to <8 x i1>
1023  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1024  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
1025  ret <2 x i64> %2
1026}
1027
1028define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
1029; CHECK-LABEL: test_mm_shrdi_epi64:
1030; CHECK:       # %bb.0: # %entry
1031; CHECK-NEXT:    vpshrdq $31, %xmm1, %xmm0, %xmm0
1032; CHECK-NEXT:    ret{{[l|q]}}
1033entry:
1034  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 31, i64 31>)
1035  ret <2 x i64> %0
1036}
1037
1038define <4 x i64> @test_mm256_mask_shrdi_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1039; X86-LABEL: test_mm256_mask_shrdi_epi32:
1040; X86:       # %bb.0: # %entry
1041; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1042; X86-NEXT:    kmovd %eax, %k1
1043; X86-NEXT:    vpshrdd $7, %ymm2, %ymm1, %ymm0 {%k1}
1044; X86-NEXT:    retl
1045;
1046; X64-LABEL: test_mm256_mask_shrdi_epi32:
1047; X64:       # %bb.0: # %entry
1048; X64-NEXT:    kmovd %edi, %k1
1049; X64-NEXT:    vpshrdd $7, %ymm2, %ymm1, %ymm0 {%k1}
1050; X64-NEXT:    retq
1051entry:
1052  %0 = bitcast <4 x i64> %__A to <8 x i32>
1053  %1 = bitcast <4 x i64> %__B to <8 x i32>
1054  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>)
1055  %3 = bitcast <4 x i64> %__S to <8 x i32>
1056  %4 = bitcast i8 %__U to <8 x i1>
1057  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
1058  %6 = bitcast <8 x i32> %5 to <4 x i64>
1059  ret <4 x i64> %6
1060}
1061
1062declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
1063
1064define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1065; X86-LABEL: test_mm256_maskz_shrdi_epi32:
1066; X86:       # %bb.0: # %entry
1067; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1068; X86-NEXT:    kmovd %eax, %k1
1069; X86-NEXT:    vpshrdd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
1070; X86-NEXT:    retl
1071;
1072; X64-LABEL: test_mm256_maskz_shrdi_epi32:
1073; X64:       # %bb.0: # %entry
1074; X64-NEXT:    kmovd %edi, %k1
1075; X64-NEXT:    vpshrdd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
1076; X64-NEXT:    retq
1077entry:
1078  %0 = bitcast <4 x i64> %__A to <8 x i32>
1079  %1 = bitcast <4 x i64> %__B to <8 x i32>
1080  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>)
1081  %3 = bitcast i8 %__U to <8 x i1>
1082  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
1083  %5 = bitcast <8 x i32> %4 to <4 x i64>
1084  ret <4 x i64> %5
1085}
1086
1087define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
1088; CHECK-LABEL: test_mm256_shrdi_epi32:
1089; CHECK:       # %bb.0: # %entry
1090; CHECK-NEXT:    vpshrdd $31, %ymm1, %ymm0, %ymm0
1091; CHECK-NEXT:    ret{{[l|q]}}
1092entry:
1093  %0 = bitcast <4 x i64> %__A to <8 x i32>
1094  %1 = bitcast <4 x i64> %__B to <8 x i32>
1095  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>)
1096  %3 = bitcast <8 x i32> %2 to <4 x i64>
1097  ret <4 x i64> %3
1098}
1099
1100define <2 x i64> @test_mm_mask_shrdi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1101; X86-LABEL: test_mm_mask_shrdi_epi32:
1102; X86:       # %bb.0: # %entry
1103; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1104; X86-NEXT:    kmovd %eax, %k1
1105; X86-NEXT:    vpshrdd $7, %xmm2, %xmm1, %xmm0 {%k1}
1106; X86-NEXT:    retl
1107;
1108; X64-LABEL: test_mm_mask_shrdi_epi32:
1109; X64:       # %bb.0: # %entry
1110; X64-NEXT:    kmovd %edi, %k1
1111; X64-NEXT:    vpshrdd $7, %xmm2, %xmm1, %xmm0 {%k1}
1112; X64-NEXT:    retq
1113entry:
1114  %0 = bitcast <2 x i64> %__A to <4 x i32>
1115  %1 = bitcast <2 x i64> %__B to <4 x i32>
1116  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
1117  %3 = bitcast <2 x i64> %__S to <4 x i32>
1118  %4 = bitcast i8 %__U to <8 x i1>
1119  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1120  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
1121  %6 = bitcast <4 x i32> %5 to <2 x i64>
1122  ret <2 x i64> %6
1123}
1124
1125declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
1126
1127define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1128; X86-LABEL: test_mm_maskz_shrdi_epi32:
1129; X86:       # %bb.0: # %entry
1130; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1131; X86-NEXT:    kmovd %eax, %k1
1132; X86-NEXT:    vpshrdd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
1133; X86-NEXT:    retl
1134;
1135; X64-LABEL: test_mm_maskz_shrdi_epi32:
1136; X64:       # %bb.0: # %entry
1137; X64-NEXT:    kmovd %edi, %k1
1138; X64-NEXT:    vpshrdd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
1139; X64-NEXT:    retq
1140entry:
1141  %0 = bitcast <2 x i64> %__A to <4 x i32>
1142  %1 = bitcast <2 x i64> %__B to <4 x i32>
1143  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
1144  %3 = bitcast i8 %__U to <8 x i1>
1145  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1146  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
1147  %5 = bitcast <4 x i32> %4 to <2 x i64>
1148  ret <2 x i64> %5
1149}
1150
1151define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
1152; CHECK-LABEL: test_mm_shrdi_epi32:
1153; CHECK:       # %bb.0: # %entry
1154; CHECK-NEXT:    vpshrdd $31, %xmm1, %xmm0, %xmm0
1155; CHECK-NEXT:    ret{{[l|q]}}
1156entry:
1157  %0 = bitcast <2 x i64> %__A to <4 x i32>
1158  %1 = bitcast <2 x i64> %__B to <4 x i32>
1159  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
1160  %3 = bitcast <4 x i32> %2 to <2 x i64>
1161  ret <2 x i64> %3
1162}
1163
1164define <4 x i64> @test_mm256_mask_shrdi_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1165; X86-LABEL: test_mm256_mask_shrdi_epi16:
1166; X86:       # %bb.0: # %entry
1167; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1168; X86-NEXT:    vpshrdw $3, %ymm2, %ymm1, %ymm0 {%k1}
1169; X86-NEXT:    retl
1170;
1171; X64-LABEL: test_mm256_mask_shrdi_epi16:
1172; X64:       # %bb.0: # %entry
1173; X64-NEXT:    kmovd %edi, %k1
1174; X64-NEXT:    vpshrdw $3, %ymm2, %ymm1, %ymm0 {%k1}
1175; X64-NEXT:    retq
1176entry:
1177  %0 = bitcast <4 x i64> %__A to <16 x i16>
1178  %1 = bitcast <4 x i64> %__B to <16 x i16>
1179  %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
1180  %3 = bitcast <4 x i64> %__S to <16 x i16>
1181  %4 = bitcast i16 %__U to <16 x i1>
1182  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
1183  %6 = bitcast <16 x i16> %5 to <4 x i64>
1184  ret <4 x i64> %6
1185}
1186
1187declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
1188
1189define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1190; X86-LABEL: test_mm256_maskz_shrdi_epi16:
1191; X86:       # %bb.0: # %entry
1192; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1193; X86-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
1194; X86-NEXT:    retl
1195;
1196; X64-LABEL: test_mm256_maskz_shrdi_epi16:
1197; X64:       # %bb.0: # %entry
1198; X64-NEXT:    kmovd %edi, %k1
1199; X64-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
1200; X64-NEXT:    retq
1201entry:
1202  %0 = bitcast <4 x i64> %__A to <16 x i16>
1203  %1 = bitcast <4 x i64> %__B to <16 x i16>
1204  %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1205  %3 = bitcast i16 %__U to <16 x i1>
1206  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
1207  %5 = bitcast <16 x i16> %4 to <4 x i64>
1208  ret <4 x i64> %5
1209}
1210
1211define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
1212; CHECK-LABEL: test_mm256_shrdi_epi16:
1213; CHECK:       # %bb.0: # %entry
1214; CHECK-NEXT:    vpshrdw $15, %ymm1, %ymm0, %ymm0
1215; CHECK-NEXT:    ret{{[l|q]}}
1216entry:
1217  %0 = bitcast <4 x i64> %__A to <16 x i16>
1218  %1 = bitcast <4 x i64> %__B to <16 x i16>
1219  %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
1220  %3 = bitcast <16 x i16> %2 to <4 x i64>
1221  ret <4 x i64> %3
1222}
1223
1224define <2 x i64> @test_mm_mask_shrdi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1225; X86-LABEL: test_mm_mask_shrdi_epi16:
1226; X86:       # %bb.0: # %entry
1227; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1228; X86-NEXT:    kmovd %eax, %k1
1229; X86-NEXT:    vpshrdw $3, %xmm2, %xmm1, %xmm0 {%k1}
1230; X86-NEXT:    retl
1231;
1232; X64-LABEL: test_mm_mask_shrdi_epi16:
1233; X64:       # %bb.0: # %entry
1234; X64-NEXT:    kmovd %edi, %k1
1235; X64-NEXT:    vpshrdw $3, %xmm2, %xmm1, %xmm0 {%k1}
1236; X64-NEXT:    retq
1237entry:
1238  %0 = bitcast <2 x i64> %__A to <8 x i16>
1239  %1 = bitcast <2 x i64> %__B to <8 x i16>
1240  %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
1241  %3 = bitcast <2 x i64> %__S to <8 x i16>
1242  %4 = bitcast i8 %__U to <8 x i1>
1243  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
1244  %6 = bitcast <8 x i16> %5 to <2 x i64>
1245  ret <2 x i64> %6
1246}
1247
1248declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
1249
1250define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1251; X86-LABEL: test_mm_maskz_shrdi_epi16:
1252; X86:       # %bb.0: # %entry
1253; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1254; X86-NEXT:    kmovd %eax, %k1
1255; X86-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
1256; X86-NEXT:    retl
1257;
1258; X64-LABEL: test_mm_maskz_shrdi_epi16:
1259; X64:       # %bb.0: # %entry
1260; X64-NEXT:    kmovd %edi, %k1
1261; X64-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
1262; X64-NEXT:    retq
1263entry:
1264  %0 = bitcast <2 x i64> %__A to <8 x i16>
1265  %1 = bitcast <2 x i64> %__B to <8 x i16>
1266  %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1267  %3 = bitcast i8 %__U to <8 x i1>
1268  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
1269  %5 = bitcast <8 x i16> %4 to <2 x i64>
1270  ret <2 x i64> %5
1271}
1272
1273define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
1274; CHECK-LABEL: test_mm_shrdi_epi16:
1275; CHECK:       # %bb.0: # %entry
1276; CHECK-NEXT:    vpshrdw $15, %xmm1, %xmm0, %xmm0
1277; CHECK-NEXT:    ret{{[l|q]}}
1278entry:
1279  %0 = bitcast <2 x i64> %__A to <8 x i16>
1280  %1 = bitcast <2 x i64> %__B to <8 x i16>
1281  %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
1282  %3 = bitcast <8 x i16> %2 to <2 x i64>
1283  ret <2 x i64> %3
1284}
1285
1286define <4 x i64> @test_mm256_mask_shldv_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1287; X86-LABEL: test_mm256_mask_shldv_epi64:
1288; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__S
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_shldv_epi64(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_shldv_epi64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shldv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <2 x i64> @test_mm_mask_shldv_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__S
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_shldv_epi64(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_shldv_epi64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shldv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <4 x i64> @test_mm256_mask_shldv_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shldv_epi32(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shldv_epi32(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shldv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shldv_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shldv_epi32(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shldv_epi32(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shldv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_mask_shldv_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %0
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shldv_epi16(i16 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shldv_epi16(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shldv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
  %4 = bitcast <16 x i16> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shldv_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %0
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shldv_epi16(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shldv_epi16(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shldv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_mask_shrdv_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__S
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_shrdv_epi64(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_shrdv_epi64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shrdv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <2 x i64> @test_mm_mask_shrdv_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__S
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_shrdv_epi64(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_shrdv_epi64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shrdv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <4 x i64> @test_mm256_mask_shrdv_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shrdv_epi32(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shrdv_epi32(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shrdv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shrdv_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shrdv_epi32(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shrdv_epi32(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shrdv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_mask_shrdv_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %0
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shrdv_epi16(i16 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shrdv_epi16(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shrdv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
  %4 = bitcast <16 x i16> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shrdv_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %0
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shrdv_epi16(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shrdv_epi16(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shrdv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)