; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-ALL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-PERLANE

; AVX2 Logical Shift Left
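; The shift amounts below are uniform constants: a shift by zero folds away
; entirely, a left shift by one is lowered to vpaddw/vpaddd/vpaddq, and the
; remaining amounts use the immediate forms of vpsllw/vpslld/vpsllq.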

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsllw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

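; A variable but uniform shift amount is moved into an XMM register (via vmovd
; on x86-64, from memory on i686) and used as the count operand of vpslld,
; which shifts every element by the same amount.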
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86-LABEL: test_vpslld_var:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X86-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsllq $63, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
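; Constant arithmetic right shifts use the immediate forms of vpsraw/vpsrad;
; as above, a shift by zero folds away. Only 16-bit and 32-bit elements are
; covered here, since AVX2 has no packed arithmetic right shift for 64-bit
; elements.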

define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrad $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right
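; Constant logical right shifts follow the same pattern: a shift by zero folds
; away and the other amounts use the immediate forms of vpsrlw/vpsrld/vpsrlq.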

define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlw $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrld $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlq $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlq $63, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

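; The 64-bit mask and truncation of the shift amounts are narrowed to a 32-bit
; AND before the variable shift. The FAST-ALL runs select a cross-lane vpermd
; to gather the even dwords, while the SLOW and FAST-PERLANE runs use
; vextractf128 + vshufps instead.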
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X86-SLOW-LABEL: srl_trunc_and_v4i64:
; X86-SLOW:       # %bb.0:
; X86-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-SLOW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-SLOW-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-SLOW-NEXT:    vzeroupper
; X86-SLOW-NEXT:    retl
;
; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X86-FAST-ALL:       # %bb.0:
; X86-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; X86-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X86-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X86-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-ALL-NEXT:    vzeroupper
; X86-FAST-ALL-NEXT:    retl
;
; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X86-FAST-PERLANE:       # %bb.0:
; X86-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-FAST-PERLANE-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-PERLANE-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-PERLANE-NEXT:    vzeroupper
; X86-FAST-PERLANE-NEXT:    retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW:       # %bb.0:
; X64-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X64-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-SLOW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT:    vzeroupper
; X64-SLOW-NEXT:    retq
;
; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X64-FAST-ALL:       # %bb.0:
; X64-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; X64-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-ALL-NEXT:    vzeroupper
; X64-FAST-ALL-NEXT:    retq
;
; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X64-FAST-PERLANE:       # %bb.0:
; X64-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X64-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-FAST-PERLANE-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-PERLANE-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-PERLANE-NEXT:    vzeroupper
; X64-FAST-PERLANE-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

;
; Vectorized variable shifts
;
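; AVX2 only provides variable-count shift instructions for 32-bit and 64-bit
; elements, so the i16 shifts below are widened to 32-bit lanes, shifted with
; vpsllvd/vpsrlvd/vpsravd and repacked, while the i8 shifts are emulated by
; moving the amounts into the sign bits with vpsllw $5 and using a vpblendvb
; ladder that conditionally applies shifts of 4, 2 and 1.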

define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: shl_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: shl_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: shl_32i8:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-NEXT:    vpsllw $4, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsllw $2, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: ashr_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: ashr_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; CHECK-LABEL: ashr_32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT:    vpsraw $4, %ymm3, %ymm4
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpsraw $2, %ymm3, %ymm4
; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpsraw $1, %ymm3, %ymm4
; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; CHECK-NEXT:    vpsrlw $8, %ymm2, %ymm2
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT:    vpsraw $4, %ymm0, %ymm3
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $2, %ymm0, %ymm3
; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm3
; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: lshr_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: lshr_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: lshr_32i8:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}