; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (shl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (shl x, 0) -> x
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (shl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}

; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_trunc_and:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (shl (shl x, c1), c2) -> 0
define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shlr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shlr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $20, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $20, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $20, %xmm1
; SSE41-NEXT:    pslld $20, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <8 x i32> %3
}

; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_zext_lshr0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_zext_lshr0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 15>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15>
  ret <8 x i32> %3
}

; fold (shl (sr[la] exact X,  C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
define <4 x i32> @combine_vec_shl_ge_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_exact0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ge_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sr[la] exact SEL(X,Y),  C1), C2) -> (shl SEL(X,Y), (C2-C1)) if C1 <= C2
define i32 @combine_shl_ge_sel_ashr_exact0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: combine_shl_ge_sel_ashr_exact0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    testl %edx, %edx
; CHECK-NEXT:    cmovel %esi, %edi
; CHECK-NEXT:    leal (,%rdi,4), %eax
; CHECK-NEXT:    retq
  %cmp = icmp ne i32 %z, 0
  %ashrx = ashr exact i32 %x, 3
  %ashry = ashr exact i32 %y, 3
  %sel = select i1 %cmp, i32 %ashrx, i32 %ashry
  %shl = shl i32 %sel, 5
  ret i32 %shl
}

; fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C1-C2)) if C1  > C2
define <4 x i32> @combine_vec_shl_lt_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_exact0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_lt_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_gt_lshr0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_gt_lshr0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 29>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 31>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_le_lshr0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_le_lshr0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_ashr0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_ashr0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_add0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_add0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_or0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_or0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or  <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [20,20,20,20]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec_shl_mul0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_shl_mul0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (add (shl x, c1), c2) -> (or (shl x, c1), c2)
define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec_add_shl_nonsplat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_add_shl_nonsplat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
  %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shuffle_shl:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,3,0]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shuffle_shl:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $3, %xmm1
; SSE41-NEXT:    pslld $2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec_add_shuffle_shl:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_add_shuffle_shl:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_shl_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_clamped1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_clamped1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_clamped1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shl = shl <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> %shl, <4 x i32> zeroinitializer
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_clamped2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_clamped2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_clamped2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
  %shl = shl <4 x i32> %1, %amt
  ret <4 x i32> %shl
}

define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_commuted_clamped:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_commuted_clamped:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_commuted_clamped:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
  %shl = shl <4 x i32> %1, %amt
  ret <4 x i32> %shl
}

define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_commuted_clamped1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pslld $23, %xmm2
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_commuted_clamped1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT:    pminud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_commuted_clamped1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shl = shl <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shl
  ret <4 x i32> %1
}