; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
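; A left shift by a constant amount k is the same as a multiply by 2^k, so the
; shift amounts <1,1,2,3,7,0,9,11> below should map to the pmullw constant
; [2,2,4,8,128,1,512,2048]; i.e. the expected lowering is equivalent to
;   %mul = mul <8 x i16> %a, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>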

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,2,4,8,128,1,512,2048]
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

; Only two legal shift amounts, so we can lower to shuffle(psllw(),psllw())
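; Note: the only in-range, non-undef shift amounts here are 0 and 1 (a shift
; amount of -1 is out of range and therefore poison), so the expected lowering
; is a single add (x << 1 == x + x) blended with the unshifted input, as in the
; paddw + shufps/pblendw/vpblendd sequences below.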

define <8 x i16> @test2(<8 x i16> %a) {
; SSE2-LABEL: test2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
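; Note: the -1 and -3 amounts are out of range (poison), so only the lanes
; shifted by 1 and 2 matter; as checked below, SSE4.1 currently selects a
; pslld/paddd pair plus a blend rather than a pmulld, and the AVX targets use
; vpsllvd.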

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

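; Same idea as test2, but with 32-bit elements: the 0/1 shift amounts should
; lower to a paddd blended with the unshifted input on SSE, and to a vpsllvd
; on AVX.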
define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
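; The same [2,2,4,8,128,1,512,2048] multiplier is reused for both 128-bit
; halves, so the SSE lowering should materialize the constant once and issue
; two pmullw instructions.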

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
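; Note: plain SSE2 has no pmulld, so there the 32-bit multiply itself is
; emulated with pmuludq on the even/odd lanes plus shuffles, which is what the
; long SSE2 sequence below is doing.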

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2, the shift is instead split into four
; parts, each of which is then converted into a pmullw.
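; Note: -mattr=avx512f alone does not provide a 512-bit vpmullw (that requires
; AVX512BW), so the AVX512 run line below still splits the vector into two ymm
; halves.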

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support we produce a
; single vpsllvd/vpsllvq instead of a pair of them.
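; AVX512F does provide a 512-bit vpsllvd, so a single zmm shift with a
; constant-pool operand is expected below, while AVX2 splits the operation
; into two 256-bit vpsllvd instructions.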

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; In 'test9', each shift amount is applied separately and the results are
; blended if we don't have AVX2/AVX512F support.
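; Before AVX2 there is no per-element variable 64-bit shift, so the SSE code
; below shifts each distinct amount separately (psllq $2 / psllq $3, plus a
; paddq for the shift by 1) and blends the results; AVX2 uses vpsllvq on each
; ymm half and AVX512 a single zmm vpsllvq.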

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}