; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

; fold (urem x, 1) -> 0
define i32 @combine_urem_by_one(i32 %x) {
; CHECK-LABEL: combine_urem_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_one:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (urem x, -1) -> select((icmp eq x, -1), 0, x)
define i32 @combine_urem_by_negone(i32 %x) {
; CHECK-LABEL: combine_urem_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    cmovnel %edi, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; Use PSLLI intrinsic to postpone the undef creation until after urem-by-constant expansion

define <4 x i32> @combine_vec_urem_undef_by_negone(<4 x i32> %in) {
; SSE-LABEL: combine_vec_urem_undef_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pandn %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_undef_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpandn %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 0)
  %y = urem <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %y
}

; fold (urem x, INT_MIN) -> (and x, ~INT_MIN)
define i32 @combine_urem_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_urem_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; fold (urem 0, x) -> 0
define i32 @combine_urem_zero(i32 %x) {
; CHECK-LABEL: combine_urem_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (urem x, x) -> 0
define i32 @combine_urem_dupe(i32 %x) {
; CHECK-LABEL: combine_urem_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (urem x, pow2) -> (and x, (pow2-1))
define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2d:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrld %xmm5, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm1, %xmm4
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = lshr <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pslld $2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpslld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_lshr_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    pmovsxbd {{.*#+}} xmm3 = [4,4,4,4]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrld %xmm5, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm1, %xmm4
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [4,4,4,4]
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_lshr_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_lshr_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    pmovsxbd {{.*#+}} xmm3 = [1,4,8,16]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrld %xmm5, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm1, %xmm4
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,4,8,16]
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

; FIXME: PR55271 - urem(undef, 3) != undef
; Use PSLLI intrinsic to postpone the undef creation until after urem-by-constant expansion
define <4 x i32> @combine_vec_urem_undef_by_3(<4 x i32> %in) {
; CHECK-LABEL: combine_vec_urem_undef_by_3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %x = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 0)
  %y = urem <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %y
}
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)

define i1 @bool_urem(i1 %x, i1 %y) {
; CHECK-LABEL: bool_urem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %r = urem i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_urem(<4 x i1> %x, <4 x i1> %y) {
; SSE-LABEL: boolvec_urem:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: boolvec_urem:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = urem <4 x i1> %x, %y
  ret <4 x i1> %r
}