; xref: /llvm-project/llvm/test/CodeGen/X86/combine-rotates.ll (revision 44e997a158610f99789f4d51e7e89e2cbadb9047)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -early-live-intervals | FileCheck %s --check-prefixes=CHECK,SSE2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
7
8; fold (rot (rot x, c1), c2) -> rot x, c1+c2
; Two per-lane rotations (rotr by {1,2,3,4}, then rotr by {12,13,14,15}),
; each spelled as an lshr/shl/or triple, merge into a single rotate per lane:
; XOP emits one vprotd and AVX512 one vprolvd from a constant pool.
9define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
10; SSE2-LABEL: combine_vec_rot_rot:
11; SSE2:       # %bb.0:
12; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
13; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
14; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
15; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
16; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
17; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
18; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
19; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
20; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
21; SSE2-NEXT:    por %xmm2, %xmm0
22; SSE2-NEXT:    retq
23;
24; XOP-LABEL: combine_vec_rot_rot:
25; XOP:       # %bb.0:
26; XOP-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
27; XOP-NEXT:    retq
28;
29; AVX2-LABEL: combine_vec_rot_rot:
30; AVX2:       # %bb.0:
31; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
32; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
33; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
34; AVX2-NEXT:    retq
35;
36; AVX512-LABEL: combine_vec_rot_rot:
37; AVX512:       # %bb.0:
38; AVX512-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
39; AVX512-NEXT:    retq
40  %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
41  %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
42  %3 = or <4 x i32> %1, %2
43  %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15>
44  %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17>
45  %6 = or <4 x i32> %4, %5
46  ret <4 x i32> %6
47}
48
; Splat-amount variant: rotr by 3 followed by rotr by 22 merges into a
; single rotate (3 + 22 = 25 right == 7 left). XOP emits vprotd $7 and
; AVX512 vprold $7; SSE2/AVX2 keep the shl/srl/or expansion of rotl 7.
49define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
50; SSE2-LABEL: combine_vec_rot_rot_splat:
51; SSE2:       # %bb.0:
52; SSE2-NEXT:    movdqa %xmm0, %xmm1
53; SSE2-NEXT:    psrld $25, %xmm1
54; SSE2-NEXT:    pslld $7, %xmm0
55; SSE2-NEXT:    por %xmm1, %xmm0
56; SSE2-NEXT:    retq
57;
58; XOP-LABEL: combine_vec_rot_rot_splat:
59; XOP:       # %bb.0:
60; XOP-NEXT:    vprotd $7, %xmm0, %xmm0
61; XOP-NEXT:    retq
62;
63; AVX2-LABEL: combine_vec_rot_rot_splat:
64; AVX2:       # %bb.0:
65; AVX2-NEXT:    vpsrld $25, %xmm0, %xmm1
66; AVX2-NEXT:    vpslld $7, %xmm0, %xmm0
67; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
68; AVX2-NEXT:    retq
69;
70; AVX512-LABEL: combine_vec_rot_rot_splat:
71; AVX512:       # %bb.0:
72; AVX512-NEXT:    vprold $7, %xmm0, %xmm0
73; AVX512-NEXT:    retq
74  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
75  %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
76  %3 = or <4 x i32> %1, %2
77  %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22>
78  %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10>
79  %6 = or <4 x i32> %4, %5
80  ret <4 x i32> %6
81}
82
; Degenerate merge: rotr 1 followed by rotr 31 totals 32 == 0 (mod 32),
; so the whole sequence folds away and every target emits a bare retq.
83define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
84; CHECK-LABEL: combine_vec_rot_rot_splat_zero:
85; CHECK:       # %bb.0:
86; CHECK-NEXT:    retq
87  %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
88  %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
89  %3 = or <4 x i32> %1, %2
90  %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31>
91  %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
92  %6 = or <4 x i32> %4, %5
93  ret <4 x i32> %6
94}
95
96; TODO - fold (select (icmp eq c, 0), x, (rot x, c)) -> rot x, c
; Scalar select-on-zero pattern (see TODO above): since rot x, 0 == x, the
; select is redundant, but it is not yet folded — the checked output still
; computes the rotate and uses cmovel to pick the original value when the
; amount is zero.
97define i32 @combine_rot_select_zero(i32, i32) {
98; CHECK-LABEL: combine_rot_select_zero:
99; CHECK:       # %bb.0:
100; CHECK-NEXT:    movl %esi, %ecx
101; CHECK-NEXT:    movl %edi, %eax
102; CHECK-NEXT:    roll %cl, %eax
103; CHECK-NEXT:    testl %esi, %esi
104; CHECK-NEXT:    cmovel %edi, %eax
105; CHECK-NEXT:    retq
106  %3 = and i32 %1, 31
107  %4 = shl i32 %0, %3
108  %5 = sub i32 0, %1
109  %6 = and i32 %5, 31
110  %7 = lshr i32 %0, %6
111  %8 = or i32 %4, %7
112  %9 = icmp eq i32 %1, 0
113  %10 = select i1 %9, i32 %0, i32 %8
114  ret i32 %10
115}
116
; Vector form of the select-on-zero pattern. AVX512 folds the select into a
; merge-masked vprolvd (vptestmd builds the mask); XOP/AVX2/SSE2 compute the
; rotate and then blend/select against the unrotated input.
117define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) {
118; SSE2-LABEL: combine_vec_rot_select_zero:
119; SSE2:       # %bb.0:
120; SSE2-NEXT:    pxor %xmm2, %xmm2
121; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
122; SSE2-NEXT:    pslld $23, %xmm1
123; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
124; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
125; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
126; SSE2-NEXT:    movdqa %xmm0, %xmm3
127; SSE2-NEXT:    pmuludq %xmm1, %xmm3
128; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
129; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
130; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
131; SSE2-NEXT:    pmuludq %xmm5, %xmm1
132; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
133; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
134; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
135; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
136; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
137; SSE2-NEXT:    por %xmm4, %xmm3
138; SSE2-NEXT:    pand %xmm2, %xmm0
139; SSE2-NEXT:    pandn %xmm3, %xmm2
140; SSE2-NEXT:    por %xmm2, %xmm0
141; SSE2-NEXT:    retq
142;
143; XOP-LABEL: combine_vec_rot_select_zero:
144; XOP:       # %bb.0:
145; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
146; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm3
147; XOP-NEXT:    vpcomeqd %xmm2, %xmm1, %xmm1
148; XOP-NEXT:    vblendvps %xmm1, %xmm0, %xmm3, %xmm0
149; XOP-NEXT:    retq
150;
151; AVX2-LABEL: combine_vec_rot_select_zero:
152; AVX2:       # %bb.0:
153; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
154; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
155; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm3
156; AVX2-NEXT:    vpsllvd %xmm3, %xmm0, %xmm4
157; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32]
158; AVX2-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
159; AVX2-NEXT:    vpsrlvd %xmm3, %xmm0, %xmm3
160; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
161; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
162; AVX2-NEXT:    vblendvps %xmm1, %xmm0, %xmm3, %xmm0
163; AVX2-NEXT:    retq
164;
165; AVX512-LABEL: combine_vec_rot_select_zero:
166; AVX512:       # %bb.0:
167; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
168; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1}
169; AVX512-NEXT:    retq
170  %3 = and <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
171  %4 = shl <4 x i32> %0, %3
172  %5 = sub <4 x i32> zeroinitializer, %1
173  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
174  %7 = lshr <4 x i32> %0, %6
175  %8 = or <4 x i32> %4, %7
176  %9 = icmp eq <4 x i32> %1, zeroinitializer
177  %10 = select <4 x i1> %9, <4 x i32> %0, <4 x i32> %8
178  ret <4 x i32> %10
179}
180
; Variable-amount rotate where both shift amounts are masked with 30 and
; negate each other mod 32: still recognized as a rotate. XOP keeps the
; vpand + vprotd, AVX512 the vpandd + vprolvd.
181define <4 x i32> @rotate_demanded_bits(<4 x i32>, <4 x i32>) {
182; SSE2-LABEL: rotate_demanded_bits:
183; SSE2:       # %bb.0:
184; SSE2-NEXT:    pslld $23, %xmm1
185; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
186; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
187; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
188; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
189; SSE2-NEXT:    pmuludq %xmm1, %xmm0
190; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
191; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
192; SSE2-NEXT:    pmuludq %xmm2, %xmm1
193; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
194; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
195; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
196; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
197; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
198; SSE2-NEXT:    por %xmm3, %xmm0
199; SSE2-NEXT:    retq
200;
201; XOP-LABEL: rotate_demanded_bits:
202; XOP:       # %bb.0:
203; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
204; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
205; XOP-NEXT:    retq
206;
207; AVX2-LABEL: rotate_demanded_bits:
208; AVX2:       # %bb.0:
209; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
210; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
211; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
212; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
213; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
214; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
215; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
216; AVX2-NEXT:    retq
217;
218; AVX512-LABEL: rotate_demanded_bits:
219; AVX512:       # %bb.0:
220; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
221; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
222; AVX512-NEXT:    retq
223  %3 = and <4 x i32> %1, <i32 30, i32 30, i32 30, i32 30>
224  %4 = shl <4 x i32> %0, %3
225  %5 = sub nsw <4 x i32> zeroinitializer, %3
226  %6 = and <4 x i32> %5, <i32 30, i32 30, i32 30, i32 30>
227  %7 = lshr <4 x i32> %0, %6
228  %8 = or <4 x i32> %7, %4
229  ret <4 x i32> %8
230}
231
; Variant with asymmetric masks: the shl amount is masked with 23 and the
; lshr amount with 31. The amounts still sum to 0 mod 32, so the pattern is
; matched as a rotate (same lowering shape as rotate_demanded_bits).
232define <4 x i32> @rotate_demanded_bits_2(<4 x i32>, <4 x i32>) {
233; SSE2-LABEL: rotate_demanded_bits_2:
234; SSE2:       # %bb.0:
235; SSE2-NEXT:    pslld $23, %xmm1
236; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
237; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
238; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
239; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
240; SSE2-NEXT:    pmuludq %xmm1, %xmm0
241; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
242; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
243; SSE2-NEXT:    pmuludq %xmm2, %xmm1
244; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
245; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
246; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
247; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
248; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
249; SSE2-NEXT:    por %xmm3, %xmm0
250; SSE2-NEXT:    retq
251;
252; XOP-LABEL: rotate_demanded_bits_2:
253; XOP:       # %bb.0:
254; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
255; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
256; XOP-NEXT:    retq
257;
258; AVX2-LABEL: rotate_demanded_bits_2:
259; AVX2:       # %bb.0:
260; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [23,23,23,23]
261; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
262; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
263; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
264; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
265; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
266; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
267; AVX2-NEXT:    retq
268;
269; AVX512-LABEL: rotate_demanded_bits_2:
270; AVX512:       # %bb.0:
271; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
272; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
273; AVX512-NEXT:    retq
274  %3 = and <4 x i32> %1, <i32 23, i32 23, i32 23, i32 23>
275  %4 = shl <4 x i32> %0, %3
276  %5 = sub nsw <4 x i32> zeroinitializer, %3
277  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
278  %7 = lshr <4 x i32> %0, %6
279  %8 = or <4 x i32> %7, %4
280  ret <4 x i32> %8
281}
282
; Variant where the amount is doubled first (shl 1) and then masked with 30.
; The rotate is still recognized; all targets keep the vpaddd/pslld doubling
; and feed the result straight into the rotate lowering.
283define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
284; SSE2-LABEL: rotate_demanded_bits_3:
285; SSE2:       # %bb.0:
286; SSE2-NEXT:    pslld $24, %xmm1
287; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
288; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
289; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
290; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
291; SSE2-NEXT:    pmuludq %xmm1, %xmm0
292; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
293; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
294; SSE2-NEXT:    pmuludq %xmm2, %xmm1
295; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
296; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
297; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
298; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
299; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
300; SSE2-NEXT:    por %xmm3, %xmm0
301; SSE2-NEXT:    retq
302;
303; XOP-LABEL: rotate_demanded_bits_3:
304; XOP:       # %bb.0:
305; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
306; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
307; XOP-NEXT:    retq
308;
309; AVX2-LABEL: rotate_demanded_bits_3:
310; AVX2:       # %bb.0:
311; AVX2-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
312; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
313; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
314; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
315; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
316; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
317; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
318; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
319; AVX2-NEXT:    retq
320;
321; AVX512-LABEL: rotate_demanded_bits_3:
322; AVX512:       # %bb.0:
323; AVX512-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
324; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
325; AVX512-NEXT:    retq
326  %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
327  %4 = and <4 x i32> %3, <i32 30, i32 30, i32 30, i32 30>
328  %5 = shl <4 x i32> %0, %4
329  %6 = sub <4 x i32> zeroinitializer, %3
330  %7 = and <4 x i32> %6, <i32 30, i32 30, i32 30, i32 30>
331  %8 = lshr <4 x i32> %0, %7
332  %9 = or <4 x i32> %5, %8
333  ret <4 x i32> %9
334}
335
; fshl (rotl) where both the value and the amount are reversed by the same
; shuffle and the result is reversed back: the shuffles cancel, and the
; checked output contains no shuffle — just the plain variable rotate.
336define <4 x i32> @rotl_binop_shuffle(<4 x i32>, <4 x i32>) {
337; SSE2-LABEL: rotl_binop_shuffle:
338; SSE2:       # %bb.0:
339; SSE2-NEXT:    pslld $23, %xmm1
340; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
341; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
342; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
343; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
344; SSE2-NEXT:    pmuludq %xmm1, %xmm0
345; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
346; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
347; SSE2-NEXT:    pmuludq %xmm2, %xmm1
348; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
349; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
350; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
351; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
352; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
353; SSE2-NEXT:    por %xmm3, %xmm0
354; SSE2-NEXT:    retq
355;
356; XOP-LABEL: rotl_binop_shuffle:
357; XOP:       # %bb.0:
358; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
359; XOP-NEXT:    retq
360;
361; AVX2-LABEL: rotl_binop_shuffle:
362; AVX2:       # %bb.0:
363; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
364; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
365; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
366; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
367; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
368; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
369; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
370; AVX2-NEXT:    retq
371;
372; AVX512-LABEL: rotl_binop_shuffle:
373; AVX512:       # %bb.0:
374; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
375; AVX512-NEXT:    retq
376  %3 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
377  %4 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
378  %5 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %3, <4 x i32> %3, <4 x i32> %4)
379  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
380  ret <4 x i32> %6
381}
382
; fshl where the value is reversed (and un-reversed after) but the amount is
; a zero-splat shuffle: the reversing shuffles on the value cancel, leaving
; a rotate by the broadcast lane-0 amount (XOP: vpshufd+vprotd, AVX512:
; vpbroadcastd+vprolvd; SSE2/AVX2 use the uniform-amount psllq expansion).
383define <4 x i32> @rotr_binop_shuffle(<4 x i32>, <4 x i32>) {
384; SSE2-LABEL: rotr_binop_shuffle:
385; SSE2:       # %bb.0:
386; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
387; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
388; SSE2-NEXT:    psllq %xmm1, %xmm2
389; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
390; SSE2-NEXT:    psllq %xmm1, %xmm0
391; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
392; SSE2-NEXT:    retq
393;
394; XOP-LABEL: rotr_binop_shuffle:
395; XOP:       # %bb.0:
396; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
397; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
398; XOP-NEXT:    retq
399;
400; AVX2-LABEL: rotr_binop_shuffle:
401; AVX2:       # %bb.0:
402; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
403; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
404; AVX2-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
405; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
406; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
407; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
408; AVX2-NEXT:    retq
409;
410; AVX512-LABEL: rotr_binop_shuffle:
411; AVX512:       # %bb.0:
412; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
413; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
414; AVX512-NEXT:    retq
415  %3 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
416  %4 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
417  %5 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %3, <4 x i32> %3, <4 x i32> %4)
418  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
419  ret <4 x i32> %6
420}
421
422; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9935
; OSS-Fuzz 9935 reproducer: trunc i40 0x7FFFFFFFFF -> i32 gives -1, and the
; expression constant-folds (the lshr amount is out of range); the checked
; output is just movl $-1. Guards against a crash on this input.
423define i32 @fuzz9935() {
424; CHECK-LABEL: fuzz9935:
425; CHECK:       # %bb.0:
426; CHECK-NEXT:    movl $-1, %eax
427; CHECK-NEXT:    retq
428  %1 = trunc i40 549755813887 to i32
429  %2 = mul i32 %1, %1
430  %3 = lshr i32 %2, %1
431  %4 = or i32 %3, %2
432  ret i32 %4
433}
434
435; Ensure we normalize the inner rotation before adding the results.
; Non-power-of-2 width: fshl.i5 by -1 must be normalized mod 5 (to 1) before
; adding the outer rotate by 1, giving a net rotl of 2 — the checked code is
; (x << 2) | ((x & 0b11000) >> 3), i.e. a 5-bit rotate-left by 2.
436define i5 @rotl_merge_i5(i5 %x) {
437; CHECK-LABEL: rotl_merge_i5:
438; CHECK:       # %bb.0:
439; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
440; CHECK-NEXT:    leal (,%rdi,4), %ecx
441; CHECK-NEXT:    movl %edi, %eax
442; CHECK-NEXT:    andb $24, %al
443; CHECK-NEXT:    shrb $3, %al
444; CHECK-NEXT:    orb %cl, %al
445; CHECK-NEXT:    retq
446  %r1 = call i5 @llvm.fshl.i5(i5 %x, i5 %x, i5 -1)
447  %r2 = call i5 @llvm.fshl.i5(i5 %r1, i5 %r1, i5 1)
448  ret i5 %r2
450declare i5 @llvm.fshl.i5(i5, i5, i5)
451
452declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
453declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
454