; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -early-live-intervals | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512

; fold (rot (rot x, c1), c2) -> rot x, c1+c2
; Per-lane rotate amounts: (31,30,29,28) + (20,19,18,17) = (19,17,15,13) mod 32.
define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: combine_vec_rot_rot:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: combine_vec_rot_rot:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_rot_rot:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
  %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15>
  %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; Splat amounts: rotl 29 then rotl 10 folds to rotl 7 ((29+10) mod 32),
; matching the immediate in vprotd/vprold below.
define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot_splat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $25, %xmm1
; SSE2-NEXT:    pslld $7, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: combine_vec_rot_rot_splat:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: combine_vec_rot_rot_splat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $25, %xmm0, %xmm1
; AVX2-NEXT:    vpslld $7, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_rot_rot_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprold $7, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22>
  %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; The two rotates sum to 32 == 0 (mod 32), so the whole sequence is a no-op
; and all targets emit a bare ret.
define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_rot_rot_splat_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31>
  %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; TODO - fold (select (icmp eq c, 0), x, (rot x, c)) -> rot x, c
define i32 @combine_rot_select_zero(i32, i32) {
; CHECK-LABEL: combine_rot_select_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %esi, %ecx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    roll %cl, %eax
; CHECK-NEXT:    testl %esi, %esi
; CHECK-NEXT:    cmovel %edi, %eax
; CHECK-NEXT:    retq
  %3 = and i32 %1, 31
  %4 = shl i32 %0, %3
  %5 = sub i32 0, %1
  %6 = and i32 %5, 31
  %7 = lshr i32 %0, %6
  %8 = or i32 %4, %7
  %9 = icmp eq i32 %1, 0
  %10 = select i1 %9, i32 %0, i32 %8
  ret i32 %10
}

; Vector form of the select-zero pattern; only AVX512 currently folds the
; select into a merge-masked vprolvd.
define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: combine_vec_rot_select_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: combine_vec_rot_select_zero:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm3
; XOP-NEXT:    vpcomeqd %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: combine_vec_rot_select_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm3
; AVX2-NEXT:    vpsllvd %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
; AVX2-NEXT:    vpsrlvd %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_rot_select_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
  %3 = and <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %4 = shl <4 x i32> %0, %3
  %5 = sub <4 x i32> zeroinitializer, %1
  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %4, %7
  %9 = icmp eq <4 x i32> %1, zeroinitializer
  %10 = select <4 x i1> %9, <4 x i32> %0, <4 x i32> %8
  ret <4 x i32> %10
}

; Both shift amounts are masked with 30, which still covers all demanded
; rotate bits, so the shl/lshr pair is recognized as a variable rotate.
define <4 x i32> @rotate_demanded_bits(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotate_demanded_bits:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotate_demanded_bits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotate_demanded_bits:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = and <4 x i32> %1, <i32 30, i32 30, i32 30, i32 30>
  %4 = shl <4 x i32> %0, %3
  %5 = sub nsw <4 x i32> zeroinitializer, %3
  %6 = and <4 x i32> %5, <i32 30, i32 30, i32 30, i32 30>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %7, %4
  ret <4 x i32> %8
}

; Variant with asymmetric masks (23 on the shl amount, 31 on the lshr amount);
; still recognizable as a rotate by the masked amount.
define <4 x i32> @rotate_demanded_bits_2(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotate_demanded_bits_2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotate_demanded_bits_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [23,23,23,23]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotate_demanded_bits_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = and <4 x i32> %1, <i32 23, i32 23, i32 23, i32 23>
  %4 = shl <4 x i32> %0, %3
  %5 = sub nsw <4 x i32> zeroinitializer, %3
  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %7, %4
  ret <4 x i32> %8
}

; The amount is 2*c (always even), so the 'and 30' masks are implied by the
; shift and fold away, leaving just the add and the rotate.
define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $24, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotate_demanded_bits_3:
; XOP:       # %bb.0:
; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotate_demanded_bits_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotate_demanded_bits_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %4 = and <4 x i32> %3, <i32 30, i32 30, i32 30, i32 30>
  %5 = shl <4 x i32> %0, %4
  %6 = sub <4 x i32> zeroinitializer, %3
  %7 = and <4 x i32> %6, <i32 30, i32 30, i32 30, i32 30>
  %8 = lshr <4 x i32> %0, %7
  %9 = or <4 x i32> %5, %8
  ret <4 x i32> %9
}

; fold shuffle(fshl(shuffle(x), shuffle(x), shuffle(amt))) -> fshl(x, x, amt):
; the lane-reversing shuffles cancel through the per-lane rotate.
define <4 x i32> @rotl_binop_shuffle(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotl_binop_shuffle:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotl_binop_shuffle:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotl_binop_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotl_binop_shuffle:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %4 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %5 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %3, <4 x i32> %3, <4 x i32> %4)
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %6
}

; Splat rotate amount: the lane-reversing shuffles on x cancel, leaving a
; rotate by the splatted lane-0 amount.
define <4 x i32> @rotr_binop_shuffle(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotr_binop_shuffle:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
; SSE2-NEXT:    psllq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    psllq %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotr_binop_shuffle:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotr_binop_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
; AVX2-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotr_binop_shuffle:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %4 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  %5 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %3, <4 x i32> %3, <4 x i32> %4)
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %6
}

; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9935
; Everything constant-folds: trunc(549755813887) = -1, and -1 | anything = -1.
define i32 @fuzz9935() {
; CHECK-LABEL: fuzz9935:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $-1, %eax
; CHECK-NEXT:    retq
  %1 = trunc i40 549755813887 to i32
  %2 = mul i32 %1, %1
  %3 = lshr i32 %2, %1
  %4 = or i32 %3, %2
  ret i32 %4
}

; Ensure we normalize the inner rotation before adding the results.
; For i5, the -1 amount normalizes mod 5, and the combined rotate matches
; the shl-by-2 / shr-by-3 sequence below.
define i5 @rotl_merge_i5(i5 %x) {
; CHECK-LABEL: rotl_merge_i5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal (,%rdi,4), %ecx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    andb $24, %al
; CHECK-NEXT:    shrb $3, %al
; CHECK-NEXT:    orb %cl, %al
; CHECK-NEXT:    retq
  %r1 = call i5 @llvm.fshl.i5(i5 %x, i5 %x, i5 -1)
  %r2 = call i5 @llvm.fshl.i5(i5 %r1, i5 %r1, i5 1)
  ret i5 %r2
}
declare i5 @llvm.fshl.i5(i5, i5, i5)

declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)