; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP

; fold (udiv x, 1) -> x
define i32 @combine_udiv_by_one(i32 %x) {
; CHECK-LABEL: combine_udiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_udiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (udiv x, -1) -> select((icmp eq x, -1), 1, 0)
define i32 @combine_udiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_udiv_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_negone:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpcomeqd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (udiv x, INT_MIN) -> (srl x, 31)
define i32 @combine_udiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_udiv_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    shrl $31, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_minsigned:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_minsigned:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}
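; A short note on the minsigned cases above: -2147483648 read as an unsigned
; 32-bit value is 2^31, so the quotient can only be 0 or 1 and the division
; reduces to a logical shift right by 31 (it is 1 exactly when the top bit of
; x is set).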

; fold (udiv 0, x) -> 0
define i32 @combine_udiv_zero(i32 %x) {
; CHECK-LABEL: combine_udiv_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_zero:
; XOP:       # %bb.0:
; XOP-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (udiv x, x) -> 1
define i32 @combine_udiv_dupe(i32 %x) {
; CHECK-LABEL: combine_udiv_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $1, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_dupe:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (udiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_udiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_pow2a:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $2, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_udiv_by_pow2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $3, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_pow2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $4, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_pow2b:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}
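; For the divisors <1, 4, 8, 16> above the per-lane shift amounts are
; <0, 2, 3, 4>: AVX2 uses a single variable shift (vpsrlvd) and XOP a single
; vpshld, while the SSE targets assemble the result from individual psrld's
; plus blends.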

define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_udiv_by_pow2c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_pow2c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_pow2c:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}
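; Dividing by (1 << y) is a per-lane variable logical right shift by y. Only
; AVX2 has vpsrlvd for this; XOP negates the shift amounts because vpshld
; shifts left for positive counts and right for negative ones, and the plain
; SSE/AVX1 targets extract each lane's count and issue four scalar-count
; psrld's that are blended back together.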

; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_udiv_by_shl_pow2a:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_shl_pow2a:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_shl_pow2a:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4294967294,4294967294,4294967294,4294967294]
; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}
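; Here the divisor is (4 << y), so the quotient is x >>u (y + 2): AVX2 adds a
; splat of 2 to the shift amounts before vpsrlvd, and XOP forms -(y + 2) by
; subtracting y from a splat of -2 (4294967294) for vpshld.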

define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_udiv_by_shl_pow2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_shl_pow2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_shl_pow2b:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292]
; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (udiv x, c1)
define i32 @combine_udiv_uniform(i32 %x) {
; CHECK-LABEL: combine_udiv_uniform:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %ecx
; CHECK-NEXT:    movl $2987803337, %eax # imm = 0xB21642C9
; CHECK-NEXT:    imulq %rcx, %rax
; CHECK-NEXT:    shrq $36, %rax
; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, 23
  ret i32 %1
}
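; A quick sanity check of the constant above (the usual multiply-high
; reciprocal lowering): 0xB21642C9 = 2987803337 = ceil(2^36 / 23), so
; (x * 2987803337) >> 36 == x udiv 23 for any 32-bit x; e.g. x = 100 gives a
; product of 298780333700, and shifting right by 36 yields 4 = 100 / 23.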

define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_uniform:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [25645,25645,25645,25645,25645,25645,25645,25645]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_uniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_uniform:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $4, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
  ret <8 x i16> %1
}
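; For the 16-bit splat of 23 the reciprocal constant ceil(2^21 / 23) = 91181
; does not fit in 16 bits, so the code above multiplies by 91181 - 65536 =
; 25645 and recovers the missing x term with the subtract/halve/add fixup
; before the final shift right by 4.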

define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $3, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [16393,59919,58255,32787,55189,8197,52429,32789]
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,2048,2048,2,2048,8,2048,2]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,2048,2048,2,2048,8,2048,2]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [8,2048,2048,2,2048,8,2048,2]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_nonuniform3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform3:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
  ret <8 x i16> %1
}
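; The trailing pmulhuw by powers of two in these non-uniform tests emulates a
; per-lane logical right shift: multiplying by 2^k and keeping the high half
; gives x >> (16 - k), e.g. 16384 = 2^14 is >> 2 and 1024 = 2^10 is >> 6,
; since there is no per-element 16-bit variable shift before AVX-512BW's
; vpsrlvw.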

define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm2
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP:       # %bb.0:
; XOP-NEXT:    movl $171, %eax
; XOP-NEXT:    vmovd %eax, %xmm1
; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpsrlw $8, %xmm1, %xmm1
; XOP-NEXT:    movl $249, %eax
; XOP-NEXT:    vmovd %eax, %xmm2
; XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
  %div = udiv <16 x i8> %x, <i8 -64, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %div
}
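; Only lane 0 above has a non-trivial divisor: -64 is 192 as an unsigned
; byte, and 171 = ceil(2^15 / 192), so (x * 171) >> 15 equals x udiv 192 for
; every 8-bit x; the other lanes divide by 1, which is why the blend keeps
; the original bytes for everything but the first element.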

define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-LABEL: pr38477:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [u,4957,57457,4103,16385,35545,2048,2115]
; SSE2-NEXT:    pmulhuw %xmm0, %xmm3
; SSE2-NEXT:    psubw %xmm3, %xmm0
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768]
; SSE2-NEXT:    paddw %xmm3, %xmm0
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: pr38477:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubw %xmm1, %xmm2
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
; SSE41-NEXT:    paddw %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,1024,1024,16,4,1024,u,4096]
; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: pr38477:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
; AVX-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,1024,1024,16,4,1024,u,4096]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    retq
;
; XOP-LABEL: pr38477:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
; XOP-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>
  ret <8 x i16> %1
}

define i1 @bool_udiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_udiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %r = udiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_udiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_udiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %r = udiv <4 x i1> %x, %y
  ret <4 x i1> %r
}
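; For the i1 and <4 x i1> divisions above the divisor can only be 0 or 1, and
; division by zero is immediate UB, so the only well-defined case is a true
; divisor and the whole operation folds to %x.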

define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
; SSE2-LABEL: vector_div_leading_zeros:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vector_div_leading_zeros:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pmuludq %xmm2, %xmm1
; SSE41-NEXT:    pmuludq %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vector_div_leading_zeros:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vector_div_leading_zeros:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
;
; XOP-LABEL: vector_div_leading_zeros:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOP-NEXT:    retq
  %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %b
}
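; 613566757 = ceil(2^32 / 7), so a single pmuludq high-half per lane computes
; x udiv 7 directly here; presumably the and with 255 is what lets the
; compiler skip the longer add/shift fixup that a full-range 32-bit divide by
; 7 would otherwise need.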