; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (srl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (srl x, 0) -> x
define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (srl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 8, i32 9, i32 10, i32 11>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> (srl x, (add c1, c2))
define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_lshr_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $10, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $8, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $6, %xmm1
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $10, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    psrld $4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = lshr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> 0
define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = lshr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $48, %xmm1
; SSE2-NEXT:    psrlq $48, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrlq $48, %xmm1
; SSE41-NEXT:    psrlq $48, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_lshr_trunc_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlq $34, %xmm2
; SSE2-NEXT:    psrlq $35, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    psrlq $33, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    psrld $19, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    psrld $18, %xmm3
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT:    psrld $17, %xmm0
; SSE2-NEXT:    psrld $16, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_trunc_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlq $35, %xmm2
; SSE41-NEXT:    psrlq $34, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlq $33, %xmm2
; SSE41-NEXT:    psrlq $32, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE41-NEXT:    movaps %xmm2, %xmm1
; SSE41-NEXT:    psrld $19, %xmm1
; SSE41-NEXT:    movaps %xmm2, %xmm3
; SSE41-NEXT:    psrld $17, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    psrld $18, %xmm2
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
  ret <4 x i32> %3
}

; fold (srl (trunc (srl x, c1)), c2) -> 0
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 48, i64 48, i64 48>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 49, i64 50, i64 51>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 25, i32 26, i32 27>
  ret <4 x i32> %3
}

; fold (srl (shl x, c), c) -> (and x, cst2)
define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_lshr_shl_mask0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_lshr_shl_mask0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 =  shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 =  shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
  %2 = lshr <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  ret <4 x i32> %2
}

; fold (srl (sra X, Y), 31) -> (srl X, 31)
define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_lshr_ashr_sign:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_ashr_sign:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, %y
  %2 = lshr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  ret <4 x i32> %2
}

; fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
; The "5" is log2 of the 32-bit element width.
define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $4, %xmm0
; SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_lshr_lzcnt_bit0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $4, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_lshr_lzcnt_bit0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $4, %xmm0, %xmm0
; AVX512-NEXT:    vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
  %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psadbw %xmm1, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    psrld $5, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pshufb %xmm0, %xmm2
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    pcmpeqb %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddd %xmm3, %xmm0
; SSE41-NEXT:    psrld $5, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vplzcntd %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
  %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %3
}
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_lshr_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_lshr_trunc_and:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %x, %2
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_clamped1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrld %xmm3, %xmm5
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,3]
; SSE2-NEXT:    pandn %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_clamped1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm6
; SSE41-NEXT:    psrld %xmm5, %xmm6
; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm1, %xmm4
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_clamped1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shr = lshr <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> %shr, <4 x i32> zeroinitializer
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_clamped2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrld %xmm0, %xmm3
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrld %xmm0, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm0, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_clamped2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm3, %xmm4
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm3, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_clamped2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
  %shr = lshr <4 x i32> %1, %amt
  ret <4 x i32> %shr
}

define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_commuted_clamped:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrld %xmm0, %xmm3
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrld %xmm0, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm0, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_commuted_clamped:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm3, %xmm4
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm3, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_commuted_clamped:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
  %shr = lshr <4 x i32> %1, %amt
  ret <4 x i32> %shr
}

define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_commuted_clamped1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm2, %xmm4
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrld %xmm3, %xmm5
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm2, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pandn %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_lshr_commuted_clamped1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm3, %xmm4
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_commuted_clamped1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shr = lshr <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
  ret <4 x i32> %1
}