; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP

; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (sdiv x, -1) -> 0 - x
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: negl %edi
; CHECK-NEXT: seto %al
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1]
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP: # %bb.0:
; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}
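
; For reference (illustrative comment only, not exercised by the RUN lines or the
; autogenerated assertions): the scalar combine_sdiv_by_minsigned lowering relies
; on negl setting the overflow flag exactly when its operand is INT_MIN, so the
; xorl+negl+seto sequence materializes the select((icmp eq x, INT_MIN), 1, 0)
; form described above. A rough IR sketch of that select:
;   %cmp = icmp eq i32 %x, -2147483648
;   %res = select i1 %cmp, i32 1, i32 0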

; fold (sdiv 0, x) -> 0
define i32 @combine_sdiv_zero(i32 %x) {
; CHECK-LABEL: combine_sdiv_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK: # %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_dupe:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE: # %bb.0:
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos0:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pos0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pos0:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pos0:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pos0:
; XOP: # %bb.0:
; XOP-NEXT: vpsrld $2, %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}
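
; For reference (illustrative IR only, not exercised by the RUN lines): in
; combine_vec_sdiv_by_pos0 the dividend is first masked with 255, so it is known
; non-negative and the sdiv by 4 behaves as an unsigned divide, i.e. a logical
; shift right:
;   %masked = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
;   %shifted = lshr <4 x i32> %masked, <i32 2, i32 2, i32 2, i32 2>
; That is why the checks above reduce to a psrld $2 plus an and with 63.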

define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pos1:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $3, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pos1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pos1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pos1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %2
}

; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX: # %bb.0:
306; AVX-NEXT: vpsrad $31, %xmm0, %xmm1 307; AVX-NEXT: vpsrld $30, %xmm1, %xmm1 308; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 309; AVX-NEXT: vpsrad $2, %xmm0, %xmm0 310; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 311; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 312; AVX-NEXT: retq 313 %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4> 314 ret <4 x i32> %1 315} 316 317define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { 318; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 319; SSE2: # %bb.0: 320; SSE2-NEXT: pxor %xmm1, %xmm1 321; SSE2-NEXT: pxor %xmm2, %xmm2 322; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 323; SSE2-NEXT: movdqa %xmm2, %xmm3 324; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 325; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2] 326; SSE2-NEXT: pmullw %xmm4, %xmm3 327; SSE2-NEXT: psrlw $8, %xmm3 328; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 329; SSE2-NEXT: pmullw %xmm4, %xmm2 330; SSE2-NEXT: psrlw $8, %xmm2 331; SSE2-NEXT: packuswb %xmm3, %xmm2 332; SSE2-NEXT: paddb %xmm0, %xmm2 333; SSE2-NEXT: movdqa %xmm2, %xmm1 334; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 335; SSE2-NEXT: psraw $8, %xmm1 336; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] 337; SSE2-NEXT: pmullw %xmm3, %xmm1 338; SSE2-NEXT: psrlw $8, %xmm1 339; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 340; SSE2-NEXT: psraw $8, %xmm2 341; SSE2-NEXT: pmullw %xmm3, %xmm2 342; SSE2-NEXT: psrlw $8, %xmm2 343; SSE2-NEXT: packuswb %xmm1, %xmm2 344; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 345; SSE2-NEXT: pand %xmm1, %xmm2 346; SSE2-NEXT: pandn %xmm0, %xmm1 347; SSE2-NEXT: por %xmm2, %xmm1 348; SSE2-NEXT: movdqa %xmm1, %xmm0 349; SSE2-NEXT: retq 350; 351; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 352; SSE41: # %bb.0: 353; SSE41-NEXT: movdqa %xmm0, %xmm1 354; SSE41-NEXT: pxor %xmm0, %xmm0 355; SSE41-NEXT: pxor %xmm3, %xmm3 356; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 357; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 358; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 359; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2] 360; SSE41-NEXT: pmullw %xmm0, %xmm3 361; SSE41-NEXT: psrlw $8, %xmm3 362; SSE41-NEXT: pmullw %xmm0, %xmm2 363; SSE41-NEXT: psrlw $8, %xmm2 364; SSE41-NEXT: packuswb %xmm3, %xmm2 365; SSE41-NEXT: paddb %xmm1, %xmm2 366; SSE41-NEXT: movdqa %xmm2, %xmm0 367; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 368; SSE41-NEXT: psraw $8, %xmm0 369; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] 370; SSE41-NEXT: pmullw %xmm3, %xmm0 371; SSE41-NEXT: psrlw $8, %xmm0 372; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 373; SSE41-NEXT: psraw $8, %xmm2 374; SSE41-NEXT: pmullw %xmm3, %xmm2 375; SSE41-NEXT: psrlw $8, %xmm2 376; SSE41-NEXT: packuswb 
%xmm0, %xmm2 377; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 378; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 379; SSE41-NEXT: movdqa %xmm1, %xmm0 380; SSE41-NEXT: retq 381; 382; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 383; AVX1: # %bb.0: 384; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 385; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 386; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 387; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2] 388; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 389; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 390; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 391; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 392; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 393; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 394; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 395; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 396; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 397; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] 398; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 399; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 400; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 401; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 402; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 403; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 404; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 405; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] 406; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 407; AVX1-NEXT: retq 408; 409; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 410; AVX2: # %bb.0: 411; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 412; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 413; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 414; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,4,2,16,8,32,64,2,256,4,2,16,8,32,64,2] 415; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 416; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 417; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 418; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 419; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 420; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,64,128,16,32,8,4,128,256,64,128,16,32,8,4,128] 421; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 422; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 423; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 424; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] 425; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 426; AVX2-NEXT: vzeroupper 427; AVX2-NEXT: retq 428; 429; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 430; AVX512F: # %bb.0: 431; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 432; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 433; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 434; AVX512F-NEXT: vpsrlvd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 435; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 436; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1 437; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 438; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 439; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 440; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] 441; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 442; AVX512F-NEXT: vzeroupper 443; AVX512F-NEXT: retq 444; 445; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 446; AVX512BW: # %bb.0: 447; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 448; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 449; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 450; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 451; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 452; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1 453; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 454; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 455; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 456; AVX512BW-NEXT: movw $257, %ax # imm = 0x101 457; AVX512BW-NEXT: kmovd %eax, %k1 458; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 459; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 460; AVX512BW-NEXT: vzeroupper 461; AVX512BW-NEXT: retq 462; 463; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 464; XOP: # %bb.0: 465; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 466; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 467; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 468; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 469; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 470; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] 471; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 472; XOP-NEXT: retq 473 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2> 474 ret <16 x i8> %1 475} 476 477define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { 478; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 479; SSE2: # %bb.0: 480; SSE2-NEXT: movdqa %xmm0, %xmm1 481; SSE2-NEXT: psraw $15, %xmm1 482; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,4,2,16,8,32,64,2] 483; SSE2-NEXT: paddw %xmm0, %xmm1 484; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535] 485; SSE2-NEXT: movdqa %xmm1, %xmm3 486; SSE2-NEXT: pand %xmm2, %xmm3 487; SSE2-NEXT: psraw $4, %xmm1 488; SSE2-NEXT: pandn %xmm1, %xmm2 489; SSE2-NEXT: por %xmm3, %xmm2 490; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535] 491; SSE2-NEXT: movdqa %xmm2, %xmm3 492; SSE2-NEXT: pand %xmm1, %xmm3 493; SSE2-NEXT: psraw $2, %xmm2 494; SSE2-NEXT: pandn %xmm2, %xmm1 495; SSE2-NEXT: por %xmm3, %xmm1 496; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0] 497; SSE2-NEXT: movdqa %xmm1, %xmm3 498; SSE2-NEXT: pand %xmm2, %xmm3 499; SSE2-NEXT: psraw $1, %xmm1 500; SSE2-NEXT: pandn %xmm1, %xmm2 501; SSE2-NEXT: por %xmm3, %xmm2 502; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] 503; SSE2-NEXT: pand %xmm1, %xmm2 504; SSE2-NEXT: pandn %xmm0, %xmm1 505; SSE2-NEXT: por %xmm2, %xmm1 506; SSE2-NEXT: movdqa %xmm1, %xmm0 507; SSE2-NEXT: retq 508; 509; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 510; SSE41: # %bb.0: 511; SSE41-NEXT: movdqa %xmm0, %xmm1 512; 
SSE41-NEXT: psraw $15, %xmm1 513; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,4,2,16,8,32,64,2] 514; SSE41-NEXT: paddw %xmm0, %xmm1 515; SSE41-NEXT: movdqa %xmm1, %xmm2 516; SSE41-NEXT: psraw $1, %xmm2 517; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,16384,u,4096,8192,2048,1024,u] 518; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] 519; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 520; SSE41-NEXT: retq 521; 522; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 523; AVX1: # %bb.0: 524; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1 525; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2] 526; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 527; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 528; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,16384,u,4096,8192,2048,1024,u] 529; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] 530; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 531; AVX1-NEXT: retq 532; 533; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 534; AVX2: # %bb.0: 535; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1 536; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2] 537; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 538; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2 539; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,16384,u,4096,8192,2048,1024,u] 540; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] 541; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 542; AVX2-NEXT: retq 543; 544; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 545; AVX512F: # %bb.0: 546; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1 547; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2] 548; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1 549; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1 550; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 551; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 552; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 553; AVX512F-NEXT: vzeroupper 554; AVX512F-NEXT: retq 555; 556; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 557; AVX512BW: # %bb.0: 558; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1 559; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 560; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1 561; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 562; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 563; AVX512BW-NEXT: retq 564; 565; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 566; XOP: # %bb.0: 567; XOP-NEXT: vpsraw $15, %xmm0, %xmm1 568; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 569; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1 570; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 571; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 572; XOP-NEXT: retq 573 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 574 ret <8 x i16> %1 575} 576 577define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { 578; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 579; SSE2: # %bb.0: 580; SSE2-NEXT: movdqa %xmm0, %xmm3 581; SSE2-NEXT: psraw $15, %xmm0 582; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [u,4,2,16,8,32,64,2] 583; SSE2-NEXT: pmulhuw %xmm7, %xmm0 584; SSE2-NEXT: paddw %xmm3, %xmm0 585; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535] 586; SSE2-NEXT: movdqa %xmm0, %xmm2 587; 
SSE2-NEXT: pand %xmm4, %xmm2 588; SSE2-NEXT: psraw $4, %xmm0 589; SSE2-NEXT: movdqa %xmm4, %xmm6 590; SSE2-NEXT: pandn %xmm0, %xmm6 591; SSE2-NEXT: por %xmm2, %xmm6 592; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535] 593; SSE2-NEXT: movdqa %xmm6, %xmm0 594; SSE2-NEXT: pand %xmm5, %xmm0 595; SSE2-NEXT: psraw $2, %xmm6 596; SSE2-NEXT: movdqa %xmm5, %xmm2 597; SSE2-NEXT: pandn %xmm6, %xmm2 598; SSE2-NEXT: por %xmm0, %xmm2 599; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,0] 600; SSE2-NEXT: movdqa %xmm2, %xmm0 601; SSE2-NEXT: pand %xmm6, %xmm0 602; SSE2-NEXT: psraw $1, %xmm2 603; SSE2-NEXT: movdqa %xmm6, %xmm8 604; SSE2-NEXT: pandn %xmm2, %xmm8 605; SSE2-NEXT: por %xmm0, %xmm8 606; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 607; SSE2-NEXT: pand %xmm2, %xmm8 608; SSE2-NEXT: movdqa %xmm2, %xmm0 609; SSE2-NEXT: pandn %xmm3, %xmm0 610; SSE2-NEXT: por %xmm8, %xmm0 611; SSE2-NEXT: movdqa %xmm1, %xmm3 612; SSE2-NEXT: psraw $15, %xmm3 613; SSE2-NEXT: pmulhuw %xmm7, %xmm3 614; SSE2-NEXT: paddw %xmm1, %xmm3 615; SSE2-NEXT: movdqa %xmm3, %xmm7 616; SSE2-NEXT: pand %xmm4, %xmm7 617; SSE2-NEXT: psraw $4, %xmm3 618; SSE2-NEXT: pandn %xmm3, %xmm4 619; SSE2-NEXT: por %xmm7, %xmm4 620; SSE2-NEXT: movdqa %xmm4, %xmm3 621; SSE2-NEXT: pand %xmm5, %xmm3 622; SSE2-NEXT: psraw $2, %xmm4 623; SSE2-NEXT: pandn %xmm4, %xmm5 624; SSE2-NEXT: por %xmm3, %xmm5 625; SSE2-NEXT: movdqa %xmm5, %xmm3 626; SSE2-NEXT: pand %xmm6, %xmm3 627; SSE2-NEXT: psraw $1, %xmm5 628; SSE2-NEXT: pandn %xmm5, %xmm6 629; SSE2-NEXT: por %xmm3, %xmm6 630; SSE2-NEXT: pand %xmm2, %xmm6 631; SSE2-NEXT: pandn %xmm1, %xmm2 632; SSE2-NEXT: por %xmm6, %xmm2 633; SSE2-NEXT: movdqa %xmm2, %xmm1 634; SSE2-NEXT: retq 635; 636; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 637; SSE41: # %bb.0: 638; SSE41-NEXT: movdqa %xmm0, %xmm2 639; SSE41-NEXT: psraw $15, %xmm2 640; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2] 641; SSE41-NEXT: pmulhuw %xmm3, %xmm2 642; SSE41-NEXT: paddw %xmm0, %xmm2 643; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768] 644; SSE41-NEXT: movdqa %xmm2, %xmm5 645; SSE41-NEXT: pmulhw %xmm4, %xmm5 646; SSE41-NEXT: psraw $1, %xmm2 647; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] 648; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] 649; SSE41-NEXT: movdqa %xmm1, %xmm2 650; SSE41-NEXT: psraw $15, %xmm2 651; SSE41-NEXT: pmulhuw %xmm3, %xmm2 652; SSE41-NEXT: paddw %xmm1, %xmm2 653; SSE41-NEXT: pmulhw %xmm2, %xmm4 654; SSE41-NEXT: psraw $1, %xmm2 655; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6],xmm2[7] 656; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 657; SSE41-NEXT: retq 658; 659; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 660; AVX1: # %bb.0: 661; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 662; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2 663; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2] 664; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 665; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 666; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,16384,32768,4096,8192,2048,1024,32768] 667; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4 668; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 669; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7] 670; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4 671; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3 672; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3 673; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2 674; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 675; 
AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] 676; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 677; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 678; AVX1-NEXT: # ymm2 = mem[0,1,0,1] 679; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 680; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 681; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 682; AVX1-NEXT: retq 683; 684; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 685; AVX2: # %bb.0: 686; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1 687; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,4,2,16,8,32,64,2,u,4,2,16,8,32,64,2] 688; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 689; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2 690; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,16384,u,4096,8192,2048,1024,u,u,16384,u,4096,8192,2048,1024,u] 691; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15] 692; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 693; AVX2-NEXT: retq 694; 695; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 696; AVX512F: # %bb.0: 697; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 698; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,4,2,16,8,32,64,2,u,4,2,16,8,32,64,2] 699; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 700; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 701; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 702; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 703; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 704; AVX512F-NEXT: retq 705; 706; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 707; AVX512BW: # %bb.0: 708; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1 709; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 710; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1 711; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 712; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 713; AVX512BW-NEXT: retq 714; 715; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 716; XOP: # %bb.0: 717; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 718; XOP-NEXT: vpsraw $15, %xmm1, %xmm2 719; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65522,65521,65524,65523,65525,65526,65521] 720; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2 721; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1 722; XOP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535] 723; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1 724; XOP-NEXT: vpsraw $15, %xmm0, %xmm4 725; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3 726; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3 727; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2 728; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 729; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 730; XOP-NEXT: # ymm2 = mem[0,1,0,1] 731; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 732; XOP-NEXT: retq 733 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 734 ret <16 x i16> %1 735} 736 737define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { 738; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 739; SSE2: # %bb.0: 740; SSE2-NEXT: movdqa %xmm1, %xmm5 741; SSE2-NEXT: movdqa %xmm0, %xmm1 742; SSE2-NEXT: psraw $15, %xmm0 743; SSE2-NEXT: movdqa {{.*#+}} 
xmm9 = [u,4,2,16,8,32,64,2] 744; SSE2-NEXT: pmulhuw %xmm9, %xmm0 745; SSE2-NEXT: paddw %xmm1, %xmm0 746; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535] 747; SSE2-NEXT: movdqa %xmm0, %xmm4 748; SSE2-NEXT: pand %xmm6, %xmm4 749; SSE2-NEXT: psraw $4, %xmm0 750; SSE2-NEXT: movdqa %xmm6, %xmm8 751; SSE2-NEXT: pandn %xmm0, %xmm8 752; SSE2-NEXT: por %xmm4, %xmm8 753; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535] 754; SSE2-NEXT: movdqa %xmm8, %xmm0 755; SSE2-NEXT: pand %xmm7, %xmm0 756; SSE2-NEXT: psraw $2, %xmm8 757; SSE2-NEXT: movdqa %xmm7, %xmm4 758; SSE2-NEXT: pandn %xmm8, %xmm4 759; SSE2-NEXT: por %xmm0, %xmm4 760; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,0,65535,0] 761; SSE2-NEXT: movdqa %xmm4, %xmm0 762; SSE2-NEXT: pand %xmm8, %xmm0 763; SSE2-NEXT: psraw $1, %xmm4 764; SSE2-NEXT: movdqa %xmm8, %xmm10 765; SSE2-NEXT: pandn %xmm4, %xmm10 766; SSE2-NEXT: por %xmm0, %xmm10 767; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,65535] 768; SSE2-NEXT: pand %xmm4, %xmm10 769; SSE2-NEXT: movdqa %xmm4, %xmm0 770; SSE2-NEXT: pandn %xmm1, %xmm0 771; SSE2-NEXT: por %xmm10, %xmm0 772; SSE2-NEXT: movdqa %xmm5, %xmm1 773; SSE2-NEXT: psraw $15, %xmm1 774; SSE2-NEXT: pmulhuw %xmm9, %xmm1 775; SSE2-NEXT: paddw %xmm5, %xmm1 776; SSE2-NEXT: movdqa %xmm1, %xmm10 777; SSE2-NEXT: pand %xmm6, %xmm10 778; SSE2-NEXT: psraw $4, %xmm1 779; SSE2-NEXT: movdqa %xmm6, %xmm11 780; SSE2-NEXT: pandn %xmm1, %xmm11 781; SSE2-NEXT: por %xmm10, %xmm11 782; SSE2-NEXT: movdqa %xmm11, %xmm1 783; SSE2-NEXT: pand %xmm7, %xmm1 784; SSE2-NEXT: psraw $2, %xmm11 785; SSE2-NEXT: movdqa %xmm7, %xmm10 786; SSE2-NEXT: pandn %xmm11, %xmm10 787; SSE2-NEXT: por %xmm1, %xmm10 788; SSE2-NEXT: movdqa %xmm10, %xmm1 789; SSE2-NEXT: pand %xmm8, %xmm1 790; SSE2-NEXT: psraw $1, %xmm10 791; SSE2-NEXT: movdqa %xmm8, %xmm11 792; SSE2-NEXT: pandn %xmm10, %xmm11 793; SSE2-NEXT: por %xmm1, %xmm11 794; SSE2-NEXT: pand %xmm4, %xmm11 795; SSE2-NEXT: movdqa %xmm4, %xmm1 796; SSE2-NEXT: pandn %xmm5, %xmm1 797; SSE2-NEXT: por %xmm11, %xmm1 798; SSE2-NEXT: movdqa %xmm2, %xmm5 799; SSE2-NEXT: psraw $15, %xmm5 800; SSE2-NEXT: pmulhuw %xmm9, %xmm5 801; SSE2-NEXT: paddw %xmm2, %xmm5 802; SSE2-NEXT: movdqa %xmm5, %xmm10 803; SSE2-NEXT: pand %xmm6, %xmm10 804; SSE2-NEXT: psraw $4, %xmm5 805; SSE2-NEXT: movdqa %xmm6, %xmm11 806; SSE2-NEXT: pandn %xmm5, %xmm11 807; SSE2-NEXT: por %xmm10, %xmm11 808; SSE2-NEXT: movdqa %xmm11, %xmm5 809; SSE2-NEXT: pand %xmm7, %xmm5 810; SSE2-NEXT: psraw $2, %xmm11 811; SSE2-NEXT: movdqa %xmm7, %xmm10 812; SSE2-NEXT: pandn %xmm11, %xmm10 813; SSE2-NEXT: por %xmm5, %xmm10 814; SSE2-NEXT: movdqa %xmm10, %xmm5 815; SSE2-NEXT: pand %xmm8, %xmm5 816; SSE2-NEXT: psraw $1, %xmm10 817; SSE2-NEXT: movdqa %xmm8, %xmm11 818; SSE2-NEXT: pandn %xmm10, %xmm11 819; SSE2-NEXT: por %xmm5, %xmm11 820; SSE2-NEXT: pand %xmm4, %xmm11 821; SSE2-NEXT: movdqa %xmm4, %xmm5 822; SSE2-NEXT: pandn %xmm2, %xmm5 823; SSE2-NEXT: por %xmm11, %xmm5 824; SSE2-NEXT: movdqa %xmm3, %xmm2 825; SSE2-NEXT: psraw $15, %xmm2 826; SSE2-NEXT: pmulhuw %xmm9, %xmm2 827; SSE2-NEXT: paddw %xmm3, %xmm2 828; SSE2-NEXT: movdqa %xmm2, %xmm9 829; SSE2-NEXT: pand %xmm6, %xmm9 830; SSE2-NEXT: psraw $4, %xmm2 831; SSE2-NEXT: pandn %xmm2, %xmm6 832; SSE2-NEXT: por %xmm9, %xmm6 833; SSE2-NEXT: movdqa %xmm6, %xmm2 834; SSE2-NEXT: pand %xmm7, %xmm2 835; SSE2-NEXT: psraw $2, %xmm6 836; SSE2-NEXT: pandn %xmm6, %xmm7 837; SSE2-NEXT: por %xmm2, %xmm7 838; SSE2-NEXT: movdqa %xmm7, %xmm2 839; SSE2-NEXT: pand %xmm8, 
%xmm2 840; SSE2-NEXT: psraw $1, %xmm7 841; SSE2-NEXT: pandn %xmm7, %xmm8 842; SSE2-NEXT: por %xmm2, %xmm8 843; SSE2-NEXT: pand %xmm4, %xmm8 844; SSE2-NEXT: pandn %xmm3, %xmm4 845; SSE2-NEXT: por %xmm8, %xmm4 846; SSE2-NEXT: movdqa %xmm5, %xmm2 847; SSE2-NEXT: movdqa %xmm4, %xmm3 848; SSE2-NEXT: retq 849; 850; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 851; SSE41: # %bb.0: 852; SSE41-NEXT: movdqa %xmm0, %xmm6 853; SSE41-NEXT: psraw $15, %xmm6 854; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [0,4,2,16,8,32,64,2] 855; SSE41-NEXT: pmulhuw %xmm5, %xmm6 856; SSE41-NEXT: paddw %xmm0, %xmm6 857; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768] 858; SSE41-NEXT: movdqa %xmm6, %xmm7 859; SSE41-NEXT: pmulhw %xmm4, %xmm7 860; SSE41-NEXT: psraw $1, %xmm6 861; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] 862; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7] 863; SSE41-NEXT: movdqa %xmm1, %xmm6 864; SSE41-NEXT: psraw $15, %xmm6 865; SSE41-NEXT: pmulhuw %xmm5, %xmm6 866; SSE41-NEXT: paddw %xmm1, %xmm6 867; SSE41-NEXT: movdqa %xmm6, %xmm7 868; SSE41-NEXT: pmulhw %xmm4, %xmm7 869; SSE41-NEXT: psraw $1, %xmm6 870; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] 871; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7] 872; SSE41-NEXT: movdqa %xmm2, %xmm6 873; SSE41-NEXT: psraw $15, %xmm6 874; SSE41-NEXT: pmulhuw %xmm5, %xmm6 875; SSE41-NEXT: paddw %xmm2, %xmm6 876; SSE41-NEXT: movdqa %xmm6, %xmm7 877; SSE41-NEXT: pmulhw %xmm4, %xmm7 878; SSE41-NEXT: psraw $1, %xmm6 879; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] 880; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7] 881; SSE41-NEXT: movdqa %xmm3, %xmm6 882; SSE41-NEXT: psraw $15, %xmm6 883; SSE41-NEXT: pmulhuw %xmm5, %xmm6 884; SSE41-NEXT: paddw %xmm3, %xmm6 885; SSE41-NEXT: pmulhw %xmm6, %xmm4 886; SSE41-NEXT: psraw $1, %xmm6 887; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] 888; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7] 889; SSE41-NEXT: retq 890; 891; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 892; AVX1: # %bb.0: 893; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 894; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3 895; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,4,2,16,8,32,64,2] 896; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3 897; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 898; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,16384,32768,4096,8192,2048,1024,32768] 899; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5 900; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 901; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] 902; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5 903; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5 904; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5 905; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6 906; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5 907; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] 908; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 909; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 910; AVX1-NEXT: # ymm5 = mem[0,1,0,1] 911; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 912; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0 913; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 914; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 915; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6 916; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6 917; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2 918; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6 919; 
AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 920; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7] 921; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6 922; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4 923; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4 924; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3 925; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4 926; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7] 927; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 928; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 929; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1 930; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 931; AVX1-NEXT: retq 932; 933; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 934; AVX2: # %bb.0: 935; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2 936; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] 937; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 938; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 939; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2 940; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768] 941; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 942; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5 943; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 944; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15] 945; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 946; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2 947; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 948; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2 949; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3 950; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 951; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15] 952; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 953; AVX2-NEXT: retq 954; 955; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 956; AVX512F: # %bb.0: 957; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 958; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] 959; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 960; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 961; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 962; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 963; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] 964; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 965; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1 966; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 967; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 968; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5 969; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2 970; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2 971; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 972; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2 973; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 974; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 975; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 976; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 977; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) 978; AVX512F-NEXT: retq 979; 980; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 981; AVX512BW: # %bb.0: 982; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1 983; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 984; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 985; AVX512BW-NEXT: vpsravw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 986; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 987; AVX512BW-NEXT: kmovd %eax, %k1 988; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} 989; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 990; AVX512BW-NEXT: retq 991; 992; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 993; XOP: # %bb.0: 994; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 995; XOP-NEXT: vpsraw $15, %xmm2, %xmm3 996; XOP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,65522,65521,65524,65523,65525,65526,65521] 997; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3 998; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2 999; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535] 1000; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 1001; XOP-NEXT: vpsraw $15, %xmm0, %xmm5 1002; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5 1003; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5 1004; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5 1005; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 1006; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 1007; XOP-NEXT: # ymm5 = mem[0,1,0,1] 1008; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0 1009; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1010; XOP-NEXT: vpsraw $15, %xmm2, %xmm6 1011; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6 1012; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2 1013; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 1014; XOP-NEXT: vpsraw $15, %xmm1, %xmm6 1015; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4 1016; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4 1017; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3 1018; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1019; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1 1020; XOP-NEXT: retq 1021 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 1022 ret <32 x i16> %1 1023} 1024 1025define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { 1026; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1027; SSE2: # %bb.0: 1028; SSE2-NEXT: movdqa %xmm0, %xmm1 1029; SSE2-NEXT: psrad $31, %xmm1 1030; SSE2-NEXT: movdqa %xmm1, %xmm2 1031; SSE2-NEXT: psrld $28, %xmm2 1032; SSE2-NEXT: movdqa %xmm1, %xmm3 1033; SSE2-NEXT: psrld $29, %xmm3 1034; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1035; SSE2-NEXT: psrld $30, %xmm1 1036; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1037; SSE2-NEXT: paddd %xmm0, %xmm1 1038; SSE2-NEXT: movdqa %xmm1, %xmm2 1039; SSE2-NEXT: psrad $4, %xmm2 1040; SSE2-NEXT: movdqa %xmm1, %xmm3 1041; SSE2-NEXT: psrad $3, %xmm3 1042; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1043; SSE2-NEXT: psrad $2, %xmm1 1044; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1045; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1046; SSE2-NEXT: movaps %xmm1, %xmm0 1047; SSE2-NEXT: retq 1048; 1049; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1050; SSE41: # %bb.0: 1051; SSE41-NEXT: movdqa %xmm0, %xmm1 1052; SSE41-NEXT: psrad $31, %xmm1 1053; SSE41-NEXT: movdqa %xmm1, %xmm2 1054; SSE41-NEXT: psrld $28, %xmm2 1055; SSE41-NEXT: movdqa %xmm1, %xmm3 1056; SSE41-NEXT: psrld $30, %xmm3 1057; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1058; SSE41-NEXT: psrld $29, %xmm1 1059; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1060; SSE41-NEXT: paddd %xmm0, %xmm1 1061; SSE41-NEXT: movdqa %xmm1, %xmm2 1062; SSE41-NEXT: psrad $4, %xmm2 1063; SSE41-NEXT: movdqa 
%xmm1, %xmm3 1064; SSE41-NEXT: psrad $2, %xmm3 1065; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1066; SSE41-NEXT: psrad $3, %xmm1 1067; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1068; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1069; SSE41-NEXT: retq 1070; 1071; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1072; AVX1: # %bb.0: 1073; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 1074; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 1075; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 1076; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1077; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 1078; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1079; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1080; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1081; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1082; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1083; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 1084; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1085; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1086; AVX1-NEXT: retq 1087; 1088; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1089; AVX2ORLATER: # %bb.0: 1090; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 1091; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1092; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1093; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1094; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1095; AVX2ORLATER-NEXT: retq 1096; 1097; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1098; XOP: # %bb.0: 1099; XOP-NEXT: vpsrad $31, %xmm0, %xmm1 1100; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1101; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1102; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1103; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1104; XOP-NEXT: retq 1105 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16> 1106 ret <4 x i32> %1 1107} 1108 1109define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { 1110; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1111; SSE2: # %bb.0: 1112; SSE2-NEXT: movdqa %xmm0, %xmm2 1113; SSE2-NEXT: psrad $31, %xmm0 1114; SSE2-NEXT: movdqa %xmm0, %xmm3 1115; SSE2-NEXT: psrld $28, %xmm3 1116; SSE2-NEXT: movdqa %xmm0, %xmm4 1117; SSE2-NEXT: psrld $29, %xmm4 1118; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1119; SSE2-NEXT: psrld $30, %xmm0 1120; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 1121; SSE2-NEXT: paddd %xmm2, %xmm0 1122; SSE2-NEXT: movdqa %xmm0, %xmm3 1123; SSE2-NEXT: psrad $4, %xmm3 1124; SSE2-NEXT: movdqa %xmm0, %xmm4 1125; SSE2-NEXT: psrad $3, %xmm4 1126; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1127; SSE2-NEXT: psrad $2, %xmm0 1128; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 1129; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] 1130; SSE2-NEXT: movdqa %xmm1, %xmm2 1131; SSE2-NEXT: psrad $31, %xmm2 1132; SSE2-NEXT: movdqa %xmm2, %xmm3 1133; SSE2-NEXT: psrld $28, %xmm3 1134; SSE2-NEXT: movdqa %xmm2, %xmm4 1135; SSE2-NEXT: psrld $29, %xmm4 1136; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1137; SSE2-NEXT: psrld $30, %xmm2 1138; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] 1139; SSE2-NEXT: paddd %xmm1, %xmm2 1140; SSE2-NEXT: movdqa %xmm2, %xmm3 1141; SSE2-NEXT: psrad $4, %xmm3 1142; SSE2-NEXT: movdqa %xmm2, %xmm4 1143; SSE2-NEXT: psrad $3, %xmm4 1144; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = 
xmm4[1],xmm3[1] 1145; SSE2-NEXT: psrad $2, %xmm2 1146; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] 1147; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1148; SSE2-NEXT: movaps %xmm2, %xmm1 1149; SSE2-NEXT: retq 1150; 1151; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1152; SSE41: # %bb.0: 1153; SSE41-NEXT: movdqa %xmm0, %xmm2 1154; SSE41-NEXT: psrad $31, %xmm2 1155; SSE41-NEXT: movdqa %xmm2, %xmm3 1156; SSE41-NEXT: psrld $28, %xmm3 1157; SSE41-NEXT: movdqa %xmm2, %xmm4 1158; SSE41-NEXT: psrld $30, %xmm4 1159; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1160; SSE41-NEXT: psrld $29, %xmm2 1161; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1162; SSE41-NEXT: paddd %xmm0, %xmm2 1163; SSE41-NEXT: movdqa %xmm2, %xmm3 1164; SSE41-NEXT: psrad $4, %xmm3 1165; SSE41-NEXT: movdqa %xmm2, %xmm4 1166; SSE41-NEXT: psrad $2, %xmm4 1167; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1168; SSE41-NEXT: psrad $3, %xmm2 1169; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1170; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 1171; SSE41-NEXT: movdqa %xmm1, %xmm2 1172; SSE41-NEXT: psrad $31, %xmm2 1173; SSE41-NEXT: movdqa %xmm2, %xmm3 1174; SSE41-NEXT: psrld $28, %xmm3 1175; SSE41-NEXT: movdqa %xmm2, %xmm4 1176; SSE41-NEXT: psrld $30, %xmm4 1177; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1178; SSE41-NEXT: psrld $29, %xmm2 1179; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1180; SSE41-NEXT: paddd %xmm1, %xmm2 1181; SSE41-NEXT: movdqa %xmm2, %xmm3 1182; SSE41-NEXT: psrad $4, %xmm3 1183; SSE41-NEXT: movdqa %xmm2, %xmm4 1184; SSE41-NEXT: psrad $2, %xmm4 1185; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1186; SSE41-NEXT: psrad $3, %xmm2 1187; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1188; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] 1189; SSE41-NEXT: retq 1190; 1191; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1192; AVX1: # %bb.0: 1193; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1194; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 1195; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1196; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1197; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1198; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1199; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1200; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1201; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1202; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1203; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1204; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 1205; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1206; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 1207; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1208; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1209; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1210; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1211; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1212; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 1213; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1214; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1215; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1216; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1217; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1218; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1219; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 
1220; AVX1-NEXT: retq 1221; 1222; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1223; AVX2ORLATER: # %bb.0: 1224; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1 1225; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1226; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1 1227; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1228; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1229; AVX2ORLATER-NEXT: retq 1230; 1231; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1232; XOP: # %bb.0: 1233; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 1234; XOP-NEXT: vpsrad $31, %xmm1, %xmm2 1235; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967266,4294967267,4294967268] 1236; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2 1237; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1238; XOP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] 1239; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1 1240; XOP-NEXT: vpsrad $31, %xmm0, %xmm4 1241; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3 1242; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1243; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2 1244; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1245; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1246; XOP-NEXT: retq 1247 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1248 ret <8 x i32> %1 1249} 1250 1251define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { 1252; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1253; SSE2: # %bb.0: 1254; SSE2-NEXT: movdqa %xmm1, %xmm4 1255; SSE2-NEXT: movdqa %xmm0, %xmm1 1256; SSE2-NEXT: psrad $31, %xmm0 1257; SSE2-NEXT: movdqa %xmm0, %xmm5 1258; SSE2-NEXT: psrld $28, %xmm5 1259; SSE2-NEXT: movdqa %xmm0, %xmm6 1260; SSE2-NEXT: psrld $29, %xmm6 1261; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1262; SSE2-NEXT: psrld $30, %xmm0 1263; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] 1264; SSE2-NEXT: paddd %xmm1, %xmm0 1265; SSE2-NEXT: movdqa %xmm0, %xmm5 1266; SSE2-NEXT: psrad $4, %xmm5 1267; SSE2-NEXT: movdqa %xmm0, %xmm6 1268; SSE2-NEXT: psrad $3, %xmm6 1269; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1270; SSE2-NEXT: psrad $2, %xmm0 1271; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] 1272; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1273; SSE2-NEXT: movdqa %xmm4, %xmm1 1274; SSE2-NEXT: psrad $31, %xmm1 1275; SSE2-NEXT: movdqa %xmm1, %xmm5 1276; SSE2-NEXT: psrld $28, %xmm5 1277; SSE2-NEXT: movdqa %xmm1, %xmm6 1278; SSE2-NEXT: psrld $29, %xmm6 1279; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1280; SSE2-NEXT: psrld $30, %xmm1 1281; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] 1282; SSE2-NEXT: paddd %xmm4, %xmm1 1283; SSE2-NEXT: movdqa %xmm1, %xmm5 1284; SSE2-NEXT: psrad $4, %xmm5 1285; SSE2-NEXT: movdqa %xmm1, %xmm6 1286; SSE2-NEXT: psrad $3, %xmm6 1287; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1288; SSE2-NEXT: psrad $2, %xmm1 1289; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] 1290; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] 1291; SSE2-NEXT: movdqa %xmm2, %xmm4 1292; SSE2-NEXT: psrad $31, %xmm4 1293; SSE2-NEXT: movdqa %xmm4, %xmm5 1294; SSE2-NEXT: psrld $28, %xmm5 1295; SSE2-NEXT: movdqa %xmm4, %xmm6 1296; SSE2-NEXT: psrld $29, %xmm6 1297; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1298; SSE2-NEXT: psrld $30, %xmm4 1299; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] 1300; SSE2-NEXT: paddd %xmm2, %xmm4 1301; SSE2-NEXT: movdqa %xmm4, %xmm5 1302; SSE2-NEXT: psrad $4, %xmm5 1303; 
SSE2-NEXT: movdqa %xmm4, %xmm6 1304; SSE2-NEXT: psrad $3, %xmm6 1305; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1306; SSE2-NEXT: psrad $2, %xmm4 1307; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] 1308; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] 1309; SSE2-NEXT: movdqa %xmm3, %xmm5 1310; SSE2-NEXT: psrad $31, %xmm5 1311; SSE2-NEXT: movdqa %xmm5, %xmm2 1312; SSE2-NEXT: psrld $28, %xmm2 1313; SSE2-NEXT: movdqa %xmm5, %xmm6 1314; SSE2-NEXT: psrld $29, %xmm6 1315; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] 1316; SSE2-NEXT: psrld $30, %xmm5 1317; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] 1318; SSE2-NEXT: paddd %xmm3, %xmm5 1319; SSE2-NEXT: movdqa %xmm5, %xmm2 1320; SSE2-NEXT: psrad $4, %xmm2 1321; SSE2-NEXT: movdqa %xmm5, %xmm6 1322; SSE2-NEXT: psrad $3, %xmm6 1323; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] 1324; SSE2-NEXT: psrad $2, %xmm5 1325; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] 1326; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] 1327; SSE2-NEXT: movaps %xmm4, %xmm2 1328; SSE2-NEXT: movaps %xmm5, %xmm3 1329; SSE2-NEXT: retq 1330; 1331; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1332; SSE41: # %bb.0: 1333; SSE41-NEXT: movdqa %xmm0, %xmm4 1334; SSE41-NEXT: psrad $31, %xmm4 1335; SSE41-NEXT: movdqa %xmm4, %xmm5 1336; SSE41-NEXT: psrld $28, %xmm5 1337; SSE41-NEXT: movdqa %xmm4, %xmm6 1338; SSE41-NEXT: psrld $30, %xmm6 1339; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1340; SSE41-NEXT: psrld $29, %xmm4 1341; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1342; SSE41-NEXT: paddd %xmm0, %xmm4 1343; SSE41-NEXT: movdqa %xmm4, %xmm5 1344; SSE41-NEXT: psrad $4, %xmm5 1345; SSE41-NEXT: movdqa %xmm4, %xmm6 1346; SSE41-NEXT: psrad $2, %xmm6 1347; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1348; SSE41-NEXT: psrad $3, %xmm4 1349; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1350; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5,6,7] 1351; SSE41-NEXT: movdqa %xmm1, %xmm4 1352; SSE41-NEXT: psrad $31, %xmm4 1353; SSE41-NEXT: movdqa %xmm4, %xmm5 1354; SSE41-NEXT: psrld $28, %xmm5 1355; SSE41-NEXT: movdqa %xmm4, %xmm6 1356; SSE41-NEXT: psrld $30, %xmm6 1357; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1358; SSE41-NEXT: psrld $29, %xmm4 1359; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1360; SSE41-NEXT: paddd %xmm1, %xmm4 1361; SSE41-NEXT: movdqa %xmm4, %xmm5 1362; SSE41-NEXT: psrad $4, %xmm5 1363; SSE41-NEXT: movdqa %xmm4, %xmm6 1364; SSE41-NEXT: psrad $2, %xmm6 1365; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1366; SSE41-NEXT: psrad $3, %xmm4 1367; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1368; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5,6,7] 1369; SSE41-NEXT: movdqa %xmm2, %xmm4 1370; SSE41-NEXT: psrad $31, %xmm4 1371; SSE41-NEXT: movdqa %xmm4, %xmm5 1372; SSE41-NEXT: psrld $28, %xmm5 1373; SSE41-NEXT: movdqa %xmm4, %xmm6 1374; SSE41-NEXT: psrld $30, %xmm6 1375; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1376; SSE41-NEXT: psrld $29, %xmm4 1377; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1378; SSE41-NEXT: paddd %xmm2, %xmm4 1379; SSE41-NEXT: movdqa %xmm4, %xmm5 1380; SSE41-NEXT: psrad $4, %xmm5 1381; SSE41-NEXT: movdqa %xmm4, %xmm6 1382; SSE41-NEXT: psrad $2, %xmm6 1383; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 
1384; SSE41-NEXT: psrad $3, %xmm4 1385; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1386; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] 1387; SSE41-NEXT: movdqa %xmm3, %xmm4 1388; SSE41-NEXT: psrad $31, %xmm4 1389; SSE41-NEXT: movdqa %xmm4, %xmm5 1390; SSE41-NEXT: psrld $28, %xmm5 1391; SSE41-NEXT: movdqa %xmm4, %xmm6 1392; SSE41-NEXT: psrld $30, %xmm6 1393; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1394; SSE41-NEXT: psrld $29, %xmm4 1395; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1396; SSE41-NEXT: paddd %xmm3, %xmm4 1397; SSE41-NEXT: movdqa %xmm4, %xmm5 1398; SSE41-NEXT: psrad $4, %xmm5 1399; SSE41-NEXT: movdqa %xmm4, %xmm6 1400; SSE41-NEXT: psrad $2, %xmm6 1401; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1402; SSE41-NEXT: psrad $3, %xmm4 1403; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1404; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] 1405; SSE41-NEXT: retq 1406; 1407; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1408; AVX1: # %bb.0: 1409; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1410; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 1411; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1412; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1413; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1414; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1415; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1416; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1417; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1418; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1419; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1420; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1421; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1422; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 1423; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1424; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1425; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1426; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1427; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1428; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1429; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1430; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1431; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1432; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3 1433; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1434; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1435; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1436; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1437; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 1438; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1439; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1440; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1441; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1442; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1443; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1444; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1445; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1446; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1447; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1448; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1449; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 1450; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1451; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1452; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1453; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1454; AVX1-NEXT: vpblendw {{.*#+}} xmm3 
= xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1455; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 1456; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1457; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1458; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1459; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3 1460; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1461; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1462; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1463; AVX1-NEXT: retq 1464; 1465; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1466; AVX2: # %bb.0: 1467; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 1468; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28] 1469; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 1470; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1471; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 1472; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4] 1473; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1474; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1475; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1476; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 1477; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1478; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2 1479; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1480; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1481; AVX2-NEXT: retq 1482; 1483; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1484; AVX512F: # %bb.0: 1485; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1 1486; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1487; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1488; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1489; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 1490; AVX512F-NEXT: kmovw %eax, %k1 1491; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1492; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1493; AVX512F-NEXT: retq 1494; 1495; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1496; AVX512BW: # %bb.0: 1497; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1 1498; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1499; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1500; AVX512BW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1501; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111 1502; AVX512BW-NEXT: kmovd %eax, %k1 1503; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1504; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1505; AVX512BW-NEXT: retq 1506; 1507; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1508; XOP: # %bb.0: 1509; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1510; XOP-NEXT: vpsrad $31, %xmm2, %xmm3 1511; XOP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,4294967266,4294967267,4294967268] 1512; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3 1513; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1514; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292] 1515; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1516; XOP-NEXT: vpsrad $31, %xmm0, %xmm5 1517; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1518; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5 1519; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5 1520; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 1521; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1522; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1523; XOP-NEXT: vpsrad $31, %xmm2, %xmm5 1524; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1525; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2 1526; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1527; XOP-NEXT: vpsrad $31, %xmm1, %xmm5 1528; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4 1529; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4 1530; XOP-NEXT: vpshad %xmm3, %xmm4, 
%xmm3 1531; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1532; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1533; XOP-NEXT: retq 1534 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1535 ret <16 x i32> %1 1536} 1537 1538define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { 1539; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1540; SSE2: # %bb.0: 1541; SSE2-NEXT: movdqa %xmm0, %xmm1 1542; SSE2-NEXT: psrad $31, %xmm1 1543; SSE2-NEXT: psrlq $62, %xmm1 1544; SSE2-NEXT: paddq %xmm0, %xmm1 1545; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1546; SSE2-NEXT: psrad $2, %xmm2 1547; SSE2-NEXT: psrlq $2, %xmm1 1548; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1549; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1550; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1551; SSE2-NEXT: retq 1552; 1553; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1554; SSE41: # %bb.0: 1555; SSE41-NEXT: movdqa %xmm0, %xmm1 1556; SSE41-NEXT: psrad $31, %xmm1 1557; SSE41-NEXT: psrlq $62, %xmm1 1558; SSE41-NEXT: paddq %xmm0, %xmm1 1559; SSE41-NEXT: movdqa %xmm1, %xmm2 1560; SSE41-NEXT: psrad $2, %xmm2 1561; SSE41-NEXT: psrlq $2, %xmm1 1562; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1563; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1564; SSE41-NEXT: retq 1565; 1566; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1567; AVX1: # %bb.0: 1568; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1569; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1570; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1 1571; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1572; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2 1573; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1 1574; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1575; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1576; AVX1-NEXT: retq 1577; 1578; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1579; AVX2: # %bb.0: 1580; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1581; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1582; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1 1583; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1584; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2 1585; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1 1586; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] 1587; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1588; AVX2-NEXT: retq 1589; 1590; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1591; AVX512F: # %bb.0: 1592; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1593; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1 1594; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1 1595; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1596; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1 1597; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1598; AVX512F-NEXT: vzeroupper 1599; AVX512F-NEXT: retq 1600; 1601; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1602; AVX512BW: # %bb.0: 1603; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1 1604; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1 1605; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1606; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1 1607; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1608; AVX512BW-NEXT: retq 1609; 1610; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1611; XOP: # %bb.0: 1612; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1613; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1 1614; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1615; XOP-NEXT: vpshaq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1616; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1617; XOP-NEXT: retq 1618 %1 = sdiv <2 x i64> %x, <i64 1, i64 4> 1619 ret <2 x i64> %1 1620} 1621 1622define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { 1623; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1624; SSE2: # %bb.0: 1625; SSE2-NEXT: movdqa %xmm0, %xmm2 1626; SSE2-NEXT: psrad $31, %xmm2 1627; SSE2-NEXT: psrlq $62, %xmm2 1628; SSE2-NEXT: paddq %xmm0, %xmm2 1629; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 1630; SSE2-NEXT: psrad $2, %xmm3 1631; SSE2-NEXT: psrlq $2, %xmm2 1632; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1633; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1634; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 1635; SSE2-NEXT: movdqa %xmm1, %xmm2 1636; SSE2-NEXT: psrad $31, %xmm2 1637; SSE2-NEXT: movdqa %xmm2, %xmm3 1638; SSE2-NEXT: psrlq $61, %xmm3 1639; SSE2-NEXT: psrlq $60, %xmm2 1640; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 1641; SSE2-NEXT: paddq %xmm2, %xmm1 1642; SSE2-NEXT: movdqa %xmm1, %xmm2 1643; SSE2-NEXT: psrlq $3, %xmm2 1644; SSE2-NEXT: psrlq $4, %xmm1 1645; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] 1646; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488] 1647; SSE2-NEXT: xorpd %xmm2, %xmm1 1648; SSE2-NEXT: psubq %xmm2, %xmm1 1649; SSE2-NEXT: retq 1650; 1651; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1652; SSE41: # %bb.0: 1653; SSE41-NEXT: movdqa %xmm0, %xmm2 1654; SSE41-NEXT: psrad $31, %xmm2 1655; SSE41-NEXT: psrlq $62, %xmm2 1656; SSE41-NEXT: paddq %xmm0, %xmm2 1657; SSE41-NEXT: movdqa %xmm2, %xmm3 1658; SSE41-NEXT: psrad $2, %xmm3 1659; SSE41-NEXT: psrlq $2, %xmm2 1660; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1661; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1662; SSE41-NEXT: movdqa %xmm1, %xmm2 1663; SSE41-NEXT: psrad $31, %xmm2 1664; SSE41-NEXT: movdqa %xmm2, %xmm3 1665; SSE41-NEXT: psrlq $60, %xmm3 1666; SSE41-NEXT: psrlq $61, %xmm2 1667; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 1668; SSE41-NEXT: paddq %xmm2, %xmm1 1669; SSE41-NEXT: movdqa %xmm1, %xmm2 1670; SSE41-NEXT: psrlq $4, %xmm2 1671; SSE41-NEXT: psrlq $3, %xmm1 1672; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1673; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488] 1674; SSE41-NEXT: pxor %xmm2, %xmm1 1675; SSE41-NEXT: psubq %xmm2, %xmm1 1676; SSE41-NEXT: retq 1677; 1678; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1679; AVX1: # %bb.0: 1680; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1681; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1682; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 1683; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4 1684; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3 1685; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1686; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 1687; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 1688; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1 1689; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1690; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488] 1691; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 1692; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 1693; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 1694; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1695; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1696; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3 1697; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 1698; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1699; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1700; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1701; AVX1-NEXT: retq 1702; 1703; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1704; AVX2: # %bb.0: 1705; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1706; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 1707; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1708; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1709; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1710; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,2305843009213693952,1152921504606846976,576460752303423488] 1711; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 1712; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 1713; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1714; AVX2-NEXT: retq 1715; 1716; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1717; AVX512F: # %bb.0: 1718; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1719; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,3,4] 1720; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 1721; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 1722; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2 1723; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1 1724; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1725; AVX512F-NEXT: retq 1726; 1727; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1728; AVX512BW: # %bb.0: 1729; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1 1730; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1731; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1732; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1733; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1734; AVX512BW-NEXT: retq 1735; 1736; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1737; XOP: # %bb.0: 1738; XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] 1739; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 1740; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2 1741; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1742; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1743; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 1744; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1 1745; XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1746; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 1747; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1748; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1749; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1750; XOP-NEXT: retq 1751 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16> 1752 ret <4 x i64> %1 1753} 1754 1755define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { 1756; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1757; SSE2: # %bb.0: 1758; SSE2-NEXT: movdqa %xmm0, %xmm4 1759; SSE2-NEXT: psrad $31, %xmm4 1760; SSE2-NEXT: psrlq $62, %xmm4 1761; SSE2-NEXT: paddq %xmm0, %xmm4 1762; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] 1763; SSE2-NEXT: psrad $2, %xmm5 1764; SSE2-NEXT: psrlq $2, %xmm4 1765; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1766; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1767; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] 1768; SSE2-NEXT: movdqa %xmm2, %xmm4 1769; SSE2-NEXT: psrad $31, %xmm4 1770; SSE2-NEXT: psrlq $62, %xmm4 1771; SSE2-NEXT: paddq %xmm2, %xmm4 1772; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] 1773; SSE2-NEXT: psrad $2, %xmm5 1774; SSE2-NEXT: psrlq $2, %xmm4 1775; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1776; SSE2-NEXT: punpckldq 
{{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1777; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 1778; SSE2-NEXT: movdqa %xmm1, %xmm4 1779; SSE2-NEXT: psrad $31, %xmm4 1780; SSE2-NEXT: movdqa %xmm4, %xmm5 1781; SSE2-NEXT: psrlq $61, %xmm5 1782; SSE2-NEXT: psrlq $60, %xmm4 1783; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] 1784; SSE2-NEXT: paddq %xmm4, %xmm1 1785; SSE2-NEXT: movdqa %xmm1, %xmm4 1786; SSE2-NEXT: psrlq $3, %xmm4 1787; SSE2-NEXT: psrlq $4, %xmm1 1788; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] 1789; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] 1790; SSE2-NEXT: xorpd %xmm4, %xmm1 1791; SSE2-NEXT: psubq %xmm4, %xmm1 1792; SSE2-NEXT: movdqa %xmm3, %xmm5 1793; SSE2-NEXT: psrad $31, %xmm5 1794; SSE2-NEXT: movdqa %xmm5, %xmm6 1795; SSE2-NEXT: psrlq $61, %xmm6 1796; SSE2-NEXT: psrlq $60, %xmm5 1797; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 1798; SSE2-NEXT: paddq %xmm5, %xmm3 1799; SSE2-NEXT: movdqa %xmm3, %xmm5 1800; SSE2-NEXT: psrlq $3, %xmm5 1801; SSE2-NEXT: psrlq $4, %xmm3 1802; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] 1803; SSE2-NEXT: xorpd %xmm4, %xmm3 1804; SSE2-NEXT: psubq %xmm4, %xmm3 1805; SSE2-NEXT: retq 1806; 1807; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1808; SSE41: # %bb.0: 1809; SSE41-NEXT: movdqa %xmm0, %xmm4 1810; SSE41-NEXT: psrad $31, %xmm4 1811; SSE41-NEXT: psrlq $62, %xmm4 1812; SSE41-NEXT: paddq %xmm0, %xmm4 1813; SSE41-NEXT: movdqa %xmm4, %xmm5 1814; SSE41-NEXT: psrad $2, %xmm5 1815; SSE41-NEXT: psrlq $2, %xmm4 1816; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1817; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] 1818; SSE41-NEXT: movdqa %xmm2, %xmm4 1819; SSE41-NEXT: psrad $31, %xmm4 1820; SSE41-NEXT: psrlq $62, %xmm4 1821; SSE41-NEXT: paddq %xmm2, %xmm4 1822; SSE41-NEXT: movdqa %xmm4, %xmm5 1823; SSE41-NEXT: psrad $2, %xmm5 1824; SSE41-NEXT: psrlq $2, %xmm4 1825; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1826; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1827; SSE41-NEXT: movdqa %xmm1, %xmm4 1828; SSE41-NEXT: psrad $31, %xmm4 1829; SSE41-NEXT: movdqa %xmm4, %xmm5 1830; SSE41-NEXT: psrlq $60, %xmm5 1831; SSE41-NEXT: psrlq $61, %xmm4 1832; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7] 1833; SSE41-NEXT: paddq %xmm4, %xmm1 1834; SSE41-NEXT: movdqa %xmm1, %xmm4 1835; SSE41-NEXT: psrlq $4, %xmm4 1836; SSE41-NEXT: psrlq $3, %xmm1 1837; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] 1838; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] 1839; SSE41-NEXT: pxor %xmm4, %xmm1 1840; SSE41-NEXT: psubq %xmm4, %xmm1 1841; SSE41-NEXT: movdqa %xmm3, %xmm5 1842; SSE41-NEXT: psrad $31, %xmm5 1843; SSE41-NEXT: movdqa %xmm5, %xmm6 1844; SSE41-NEXT: psrlq $60, %xmm6 1845; SSE41-NEXT: psrlq $61, %xmm5 1846; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1847; SSE41-NEXT: paddq %xmm5, %xmm3 1848; SSE41-NEXT: movdqa %xmm3, %xmm5 1849; SSE41-NEXT: psrlq $4, %xmm5 1850; SSE41-NEXT: psrlq $3, %xmm3 1851; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] 1852; SSE41-NEXT: pxor %xmm4, %xmm3 1853; SSE41-NEXT: psubq %xmm4, %xmm3 1854; SSE41-NEXT: retq 1855; 1856; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1857; AVX1: # %bb.0: 1858; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1859; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1860; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 1861; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5 1862; AVX1-NEXT: vpsrlq $61, 
%xmm4, %xmm4 1863; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7] 1864; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 1865; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4 1866; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 1867; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1868; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] 1869; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1870; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 1871; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 1872; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5 1873; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5 1874; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6 1875; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5 1876; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1877; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 1878; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] 1879; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1880; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5 1881; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6 1882; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5 1883; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1884; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 1885; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5 1886; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 1887; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] 1888; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1889; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 1890; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 1891; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1892; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2 1893; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1894; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 1895; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1896; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1897; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1898; AVX1-NEXT: retq 1899; 1900; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1901; AVX2: # %bb.0: 1902; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1903; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 1904; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,62,61,60] 1905; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3 1906; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3 1907; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,2,3,4] 1908; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3 1909; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,2305843009213693952,1152921504606846976,576460752303423488] 1910; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3 1911; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 1912; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] 1913; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 1914; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2 1915; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2 1916; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2 1917; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 1918; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 1919; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1920; AVX2-NEXT: retq 1921; 1922; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1923; AVX512F: # %bb.0: 1924; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1 1925; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1926; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 1927; AVX512F-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1928; AVX512F-NEXT: movb $17, %al 1929; AVX512F-NEXT: kmovw %eax, %k1 1930; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 1931; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1932; AVX512F-NEXT: retq 1933; 1934; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1935; AVX512BW: # %bb.0: 1936; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1 1937; AVX512BW-NEXT: vpsrlvq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1938; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 1939; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1940; AVX512BW-NEXT: movb $17, %al 1941; AVX512BW-NEXT: kmovd %eax, %k1 1942; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 1943; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1944; AVX512BW-NEXT: retq 1945; 1946; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1947; XOP: # %bb.0: 1948; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1949; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] 1950; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4 1951; XOP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] 1952; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4 1953; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2 1954; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] 1955; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 1956; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6 1957; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6 1958; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6 1959; XOP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614] 1960; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6 1961; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 1962; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 1963; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1964; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6 1965; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5 1966; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2 1967; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 1968; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3 1969; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3 1970; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3 1971; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3 1972; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1973; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1974; XOP-NEXT: retq 1975 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16> 1976 ret <8 x i64> %1 1977} 1978 1979define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { 1980; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 1981; SSE2: # %bb.0: 1982; SSE2-NEXT: movdqa %xmm0, %xmm1 1983; SSE2-NEXT: psrad $31, %xmm1 1984; SSE2-NEXT: movdqa %xmm1, %xmm2 1985; SSE2-NEXT: psrld $28, %xmm2 1986; SSE2-NEXT: movdqa %xmm1, %xmm3 1987; SSE2-NEXT: psrld $29, %xmm3 1988; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1989; SSE2-NEXT: psrld $30, %xmm1 1990; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1991; SSE2-NEXT: paddd %xmm0, %xmm1 1992; SSE2-NEXT: movdqa %xmm1, %xmm2 1993; SSE2-NEXT: psrad $4, %xmm2 1994; SSE2-NEXT: movdqa %xmm1, %xmm3 1995; SSE2-NEXT: psrad $3, %xmm3 1996; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1997; SSE2-NEXT: psrad $2, %xmm1 1998; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1999; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2000; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] 2001; SSE2-NEXT: pxor %xmm2, %xmm2 2002; SSE2-NEXT: psubd %xmm1, %xmm2 2003; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] 2004; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2005; SSE2-NEXT: retq 2006; 2007; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2008; SSE41: # %bb.0: 2009; SSE41-NEXT: movdqa %xmm0, %xmm1 2010; SSE41-NEXT: psrad $31, %xmm1 2011; SSE41-NEXT: movdqa %xmm1, %xmm2 2012; SSE41-NEXT: psrld $28, %xmm2 2013; SSE41-NEXT: movdqa %xmm1, %xmm3 2014; SSE41-NEXT: psrld $30, %xmm3 2015; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2016; SSE41-NEXT: psrld $29, %xmm1 2017; SSE41-NEXT: pblendw 
{{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 2018; SSE41-NEXT: paddd %xmm0, %xmm1 2019; SSE41-NEXT: movdqa %xmm1, %xmm2 2020; SSE41-NEXT: psrad $4, %xmm2 2021; SSE41-NEXT: movdqa %xmm1, %xmm3 2022; SSE41-NEXT: psrad $2, %xmm3 2023; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2024; SSE41-NEXT: pxor %xmm2, %xmm2 2025; SSE41-NEXT: psubd %xmm3, %xmm2 2026; SSE41-NEXT: psrad $3, %xmm1 2027; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2028; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 2029; SSE41-NEXT: retq 2030; 2031; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2032; AVX1: # %bb.0: 2033; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 2034; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 2035; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 2036; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2037; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 2038; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 2039; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2040; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 2041; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 2042; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2043; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2044; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 2045; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 2046; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2047; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 2048; AVX1-NEXT: retq 2049; 2050; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2051; AVX2ORLATER: # %bb.0: 2052; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 2053; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2054; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2055; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2056; AVX2ORLATER-NEXT: vpxor %xmm2, %xmm2, %xmm2 2057; AVX2ORLATER-NEXT: vpsubd %xmm1, %xmm2, %xmm2 2058; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2059; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 2060; AVX2ORLATER-NEXT: retq 2061; 2062; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2063; XOP: # %bb.0: 2064; XOP-NEXT: vpsrad $31, %xmm0, %xmm1 2065; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2066; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2067; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2068; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 2069; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm2 2070; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2071; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 2072; XOP-NEXT: retq 2073 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16> 2074 ret <4 x i32> %1 2075} 2076 2077define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) { 2078; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1: 2079; CHECK: # %bb.0: 2080; CHECK-NEXT: retq 2081 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16> 2082 ret <4 x i32> %1 2083} 2084 2085define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) { 2086; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2: 2087; CHECK: # %bb.0: 2088; CHECK-NEXT: retq 2089 %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16> 2090 ret <4 x i32> %1 2091} 2092 2093define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) { 2094; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3: 2095; CHECK: # %bb.0: 2096; CHECK-NEXT: retq 2097 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16> 2098 ret 
<4 x i32> %1 2099} 2100 2101; PR37119 2102define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { 2103; SSE-LABEL: non_splat_minus_one_divisor_0: 2104; SSE: # %bb.0: 2105; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2106; SSE-NEXT: pxor %xmm1, %xmm0 2107; SSE-NEXT: psubb %xmm1, %xmm0 2108; SSE-NEXT: retq 2109; 2110; AVX1-LABEL: non_splat_minus_one_divisor_0: 2111; AVX1: # %bb.0: 2112; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2113; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2114; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2115; AVX1-NEXT: retq 2116; 2117; AVX2-LABEL: non_splat_minus_one_divisor_0: 2118; AVX2: # %bb.0: 2119; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2120; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2121; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2122; AVX2-NEXT: retq 2123; 2124; AVX512F-LABEL: non_splat_minus_one_divisor_0: 2125; AVX512F: # %bb.0: 2126; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2127; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 2128; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2129; AVX512F-NEXT: retq 2130; 2131; AVX512BW-LABEL: non_splat_minus_one_divisor_0: 2132; AVX512BW: # %bb.0: 2133; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2134; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB 2135; AVX512BW-NEXT: kmovd %eax, %k1 2136; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} 2137; AVX512BW-NEXT: retq 2138; 2139; XOP-LABEL: non_splat_minus_one_divisor_0: 2140; XOP: # %bb.0: 2141; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2142; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 2143; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2144; XOP-NEXT: retq 2145 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2146 ret <16 x i8> %div 2147} 2148 2149define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { 2150; SSE2-LABEL: non_splat_minus_one_divisor_1: 2151; SSE2: # %bb.0: 2152; SSE2-NEXT: pxor %xmm1, %xmm1 2153; SSE2-NEXT: pxor %xmm2, %xmm2 2154; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 2155; SSE2-NEXT: movdqa %xmm2, %xmm3 2156; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 2157; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] 2158; SSE2-NEXT: psrlw $8, %xmm3 2159; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 2160; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,256,2,256,256,256,2,256] 2161; SSE2-NEXT: psrlw $8, %xmm2 2162; SSE2-NEXT: packuswb %xmm3, %xmm2 2163; SSE2-NEXT: paddb %xmm0, %xmm2 2164; SSE2-NEXT: movdqa %xmm2, %xmm1 2165; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 2166; SSE2-NEXT: psraw $8, %xmm1 2167; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,128,128,128,128,2,128,2] 2168; SSE2-NEXT: psrlw $8, %xmm1 2169; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2170; SSE2-NEXT: psraw $8, %xmm2 2171; SSE2-NEXT: psllw $7, %xmm2 2172; SSE2-NEXT: psrlw $8, %xmm2 2173; SSE2-NEXT: packuswb %xmm1, %xmm2 2174; SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2175; SSE2-NEXT: pand %xmm1, %xmm2 2176; SSE2-NEXT: pandn %xmm0, %xmm1 2177; SSE2-NEXT: por %xmm2, %xmm1 2178; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2179; SSE2-NEXT: pxor %xmm0, %xmm1 2180; SSE2-NEXT: psubb %xmm0, %xmm1 2181; SSE2-NEXT: movdqa %xmm1, %xmm0 2182; SSE2-NEXT: retq 2183; 2184; SSE41-LABEL: non_splat_minus_one_divisor_1: 2185; SSE41: # %bb.0: 2186; SSE41-NEXT: movdqa %xmm0, %xmm1 2187; SSE41-NEXT: pxor %xmm0, %xmm0 2188; SSE41-NEXT: pxor %xmm3, %xmm3 2189; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 2190; SSE41-NEXT: pxor %xmm4, %xmm4 2191; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 2192; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2193; SSE41-NEXT: paddw %xmm2, %xmm2 2194; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] 2195; SSE41-NEXT: psrlw $8, %xmm2 2196; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2197; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] 2198; SSE41-NEXT: psrlw $8, %xmm3 2199; SSE41-NEXT: packuswb %xmm3, %xmm2 2200; SSE41-NEXT: paddb %xmm1, %xmm2 2201; SSE41-NEXT: movdqa %xmm2, %xmm0 2202; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2203; SSE41-NEXT: psraw $8, %xmm0 2204; SSE41-NEXT: movdqa %xmm0, %xmm3 2205; SSE41-NEXT: psllw $7, %xmm3 2206; SSE41-NEXT: paddw %xmm0, %xmm0 2207; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7] 2208; SSE41-NEXT: psrlw $8, %xmm0 2209; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2210; SSE41-NEXT: psraw $8, %xmm2 2211; SSE41-NEXT: psllw $7, %xmm2 2212; SSE41-NEXT: psrlw $8, %xmm2 2213; SSE41-NEXT: packuswb %xmm0, %xmm2 2214; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2215; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 2216; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2217; SSE41-NEXT: pxor %xmm0, %xmm1 2218; SSE41-NEXT: psubb %xmm0, %xmm1 2219; SSE41-NEXT: movdqa %xmm1, %xmm0 2220; SSE41-NEXT: retq 2221; 2222; AVX1-LABEL: non_splat_minus_one_divisor_1: 2223; AVX1: # %bb.0: 2224; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2225; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 2226; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2227; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2228; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 2229; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] 2230; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 2231; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2232; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128] 2233; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 
2234; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 2235; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2236; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2237; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 2238; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 2239; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 2240; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7] 2241; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2242; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2243; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 2244; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 2245; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2246; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2247; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2248; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2249; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2250; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2251; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2252; AVX1-NEXT: retq 2253; 2254; AVX2-LABEL: non_splat_minus_one_divisor_1: 2255; AVX2: # %bb.0: 2256; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2257; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2258; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2259; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,2,256,256,256,2,256,256,2,2,2,2,128,2,128] 2260; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2261; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2262; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2263; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2264; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2265; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,256,256,256,128,256,256,128,128,128,128,2,128,2] 2266; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2267; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2268; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2269; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2270; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2271; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2272; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2273; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2274; AVX2-NEXT: vzeroupper 2275; AVX2-NEXT: retq 2276; 2277; AVX512F-LABEL: non_splat_minus_one_divisor_1: 2278; AVX512F: # %bb.0: 2279; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2280; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2281; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 2282; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2283; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 2284; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2285; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 2286; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2287; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 2288; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2289; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2290; AVX512F-NEXT: vmovdqa {{.*#+}} 
xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2291; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 2292; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2293; AVX512F-NEXT: vzeroupper 2294; AVX512F-NEXT: retq 2295; 2296; AVX512BW-LABEL: non_splat_minus_one_divisor_1: 2297; AVX512BW: # %bb.0: 2298; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2299; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 2300; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2301; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2302; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2 2303; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2 2304; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2 2305; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2306; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2 2307; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB 2308; AVX512BW-NEXT: kmovd %eax, %k1 2309; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} 2310; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0 2311; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44 2312; AVX512BW-NEXT: kmovd %eax, %k1 2313; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1} 2314; AVX512BW-NEXT: vzeroupper 2315; AVX512BW-NEXT: retq 2316; 2317; XOP-LABEL: non_splat_minus_one_divisor_1: 2318; XOP: # %bb.0: 2319; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2320; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2321; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2322; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2323; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2324; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2325; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2326; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2327; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 2328; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2329; XOP-NEXT: retq 2330 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128> 2331 ret <16 x i8> %div 2332} 2333 2334define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { 2335; SSE2-LABEL: non_splat_minus_one_divisor_2: 2336; SSE2: # %bb.0: 2337; SSE2-NEXT: movdqa %xmm0, %xmm1 2338; SSE2-NEXT: psrld $31, %xmm1 2339; SSE2-NEXT: paddd %xmm0, %xmm1 2340; SSE2-NEXT: psrad $1, %xmm1 2341; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2342; SSE2-NEXT: pxor %xmm0, %xmm0 2343; SSE2-NEXT: psubd %xmm1, %xmm0 2344; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 2345; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 2346; SSE2-NEXT: retq 2347; 2348; SSE41-LABEL: non_splat_minus_one_divisor_2: 2349; SSE41: # %bb.0: 2350; SSE41-NEXT: movdqa %xmm0, %xmm1 2351; SSE41-NEXT: psrld $31, %xmm1 2352; SSE41-NEXT: paddd %xmm0, %xmm1 2353; SSE41-NEXT: psrad $1, %xmm1 2354; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2355; SSE41-NEXT: pxor %xmm1, %xmm1 2356; SSE41-NEXT: psubd %xmm0, %xmm1 2357; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2358; SSE41-NEXT: retq 2359; 2360; AVX1-LABEL: non_splat_minus_one_divisor_2: 2361; AVX1: # %bb.0: 2362; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1 2363; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2364; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 2365; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2366; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2367; AVX1-NEXT: 
vpsubd %xmm0, %xmm1, %xmm1 2368; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2369; AVX1-NEXT: retq 2370; 2371; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2: 2372; AVX2ORLATER: # %bb.0: 2373; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1 2374; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2375; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1 2376; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2377; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 2378; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2379; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 2380; AVX2ORLATER-NEXT: retq 2381; 2382; XOP-LABEL: non_splat_minus_one_divisor_2: 2383; XOP: # %bb.0: 2384; XOP-NEXT: vpsrld $31, %xmm0, %xmm1 2385; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2386; XOP-NEXT: vpsrad $1, %xmm1, %xmm1 2387; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2388; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2389; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2390; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2391; XOP-NEXT: retq 2392 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2> 2393 ret <4 x i32> %div 2394} 2395 2396define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { 2397; SSE-LABEL: combine_vec_sdiv_nonuniform: 2398; SSE: # %bb.0: 2399; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [21846,21846,21846,21846,2979,2979,2979,2979] 2400; SSE-NEXT: movdqa %xmm0, %xmm1 2401; SSE-NEXT: psrlw $15, %xmm1 2402; SSE-NEXT: paddw %xmm1, %xmm0 2403; SSE-NEXT: retq 2404; 2405; AVX-LABEL: combine_vec_sdiv_nonuniform: 2406; AVX: # %bb.0: 2407; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [21846,21846,21846,21846,2979,2979,2979,2979] 2408; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1 2409; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2410; AVX-NEXT: retq 2411 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22> 2412 ret <8 x i16> %1 2413} 2414 2415define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { 2416; SSE2-LABEL: combine_vec_sdiv_nonuniform2: 2417; SSE2: # %bb.0: 2418; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2419; SSE2-NEXT: movdqa %xmm0, %xmm1 2420; SSE2-NEXT: psraw $2, %xmm1 2421; SSE2-NEXT: movdqa %xmm0, %xmm2 2422; SSE2-NEXT: psraw $1, %xmm2 2423; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 2424; SSE2-NEXT: psrlw $15, %xmm0 2425; SSE2-NEXT: paddw %xmm2, %xmm0 2426; SSE2-NEXT: retq 2427; 2428; SSE41-LABEL: combine_vec_sdiv_nonuniform2: 2429; SSE41: # %bb.0: 2430; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2431; SSE41-NEXT: movdqa %xmm0, %xmm1 2432; SSE41-NEXT: psraw $1, %xmm1 2433; SSE41-NEXT: movdqa %xmm0, %xmm2 2434; SSE41-NEXT: psraw $2, %xmm2 2435; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2436; SSE41-NEXT: psrlw $15, %xmm0 2437; SSE41-NEXT: paddw %xmm2, %xmm0 2438; SSE41-NEXT: retq 2439; 2440; AVX1-LABEL: combine_vec_sdiv_nonuniform2: 2441; AVX1: # %bb.0: 2442; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2443; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 2444; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 2445; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2446; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2447; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2448; AVX1-NEXT: retq 2449; 2450; AVX2-LABEL: combine_vec_sdiv_nonuniform2: 2451; AVX2: # %bb.0: 2452; AVX2-NEXT: vpmulhw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2453; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1 2454; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2 2455; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2456; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2457; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2458; AVX2-NEXT: retq 2459; 2460; AVX512F-LABEL: combine_vec_sdiv_nonuniform2: 2461; AVX512F: # %bb.0: 2462; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2463; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1 2464; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2 2465; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2466; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2467; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2468; AVX512F-NEXT: retq 2469; 2470; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2: 2471; AVX512BW: # %bb.0: 2472; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2473; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2474; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2475; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2476; AVX512BW-NEXT: retq 2477; 2478; XOP-LABEL: combine_vec_sdiv_nonuniform2: 2479; XOP: # %bb.0: 2480; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243] 2481; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2482; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2483; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2484; XOP-NEXT: retq 2485 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25> 2486 ret <8 x i16> %1 2487} 2488 2489define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { 2490; SSE2-LABEL: combine_vec_sdiv_nonuniform3: 2491; SSE2: # %bb.0: 2492; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833] 2493; SSE2-NEXT: pmulhw %xmm0, %xmm1 2494; SSE2-NEXT: paddw %xmm1, %xmm0 2495; SSE2-NEXT: movdqa %xmm0, %xmm1 2496; SSE2-NEXT: psraw $4, %xmm1 2497; SSE2-NEXT: movdqa %xmm0, %xmm2 2498; SSE2-NEXT: psraw $8, %xmm2 2499; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 2500; SSE2-NEXT: psrlw $15, %xmm0 2501; SSE2-NEXT: paddw %xmm2, %xmm0 2502; SSE2-NEXT: retq 2503; 2504; SSE41-LABEL: combine_vec_sdiv_nonuniform3: 2505; SSE41: # %bb.0: 2506; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833] 2507; SSE41-NEXT: pmulhw %xmm0, %xmm1 2508; SSE41-NEXT: paddw %xmm1, %xmm0 2509; SSE41-NEXT: movdqa %xmm0, %xmm1 2510; SSE41-NEXT: psraw $8, %xmm1 2511; SSE41-NEXT: movdqa %xmm0, %xmm2 2512; SSE41-NEXT: psraw $4, %xmm2 2513; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2514; SSE41-NEXT: psrlw $15, %xmm0 2515; SSE41-NEXT: paddw %xmm2, %xmm0 2516; SSE41-NEXT: retq 2517; 2518; AVX1-LABEL: combine_vec_sdiv_nonuniform3: 2519; AVX1: # %bb.0: 2520; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833] 2521; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2522; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 2523; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 2524; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2525; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2526; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2527; AVX1-NEXT: retq 2528; 2529; AVX2-LABEL: combine_vec_sdiv_nonuniform3: 2530; AVX2: # %bb.0: 2531; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833] 2532; 
AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2533; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1 2534; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2 2535; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2536; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2537; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2538; AVX2-NEXT: retq 2539; 2540; AVX512F-LABEL: combine_vec_sdiv_nonuniform3: 2541; AVX512F: # %bb.0: 2542; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833] 2543; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2544; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1 2545; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2 2546; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2547; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2548; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2549; AVX512F-NEXT: retq 2550; 2551; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3: 2552; AVX512BW: # %bb.0: 2553; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833] 2554; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2555; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2556; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2557; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2558; AVX512BW-NEXT: retq 2559; 2560; XOP-LABEL: combine_vec_sdiv_nonuniform3: 2561; XOP: # %bb.0: 2562; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833] 2563; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2564; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2565; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2566; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2567; XOP-NEXT: retq 2568 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511> 2569 ret <8 x i16> %1 2570} 2571 2572define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { 2573; SSE2-LABEL: combine_vec_sdiv_nonuniform4: 2574; SSE2: # %bb.0: 2575; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639] 2576; SSE2-NEXT: pmulhw %xmm0, %xmm1 2577; SSE2-NEXT: psubw %xmm0, %xmm1 2578; SSE2-NEXT: movdqa %xmm1, %xmm0 2579; SSE2-NEXT: psraw $4, %xmm0 2580; SSE2-NEXT: movdqa %xmm1, %xmm2 2581; SSE2-NEXT: psraw $8, %xmm2 2582; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2583; SSE2-NEXT: psrlw $15, %xmm1 2584; SSE2-NEXT: paddw %xmm2, %xmm1 2585; SSE2-NEXT: movdqa %xmm1, %xmm0 2586; SSE2-NEXT: retq 2587; 2588; SSE41-LABEL: combine_vec_sdiv_nonuniform4: 2589; SSE41: # %bb.0: 2590; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639] 2591; SSE41-NEXT: pmulhw %xmm0, %xmm1 2592; SSE41-NEXT: psubw %xmm0, %xmm1 2593; SSE41-NEXT: movdqa %xmm1, %xmm0 2594; SSE41-NEXT: psraw $8, %xmm0 2595; SSE41-NEXT: movdqa %xmm1, %xmm2 2596; SSE41-NEXT: psraw $4, %xmm2 2597; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] 2598; SSE41-NEXT: psrlw $15, %xmm1 2599; SSE41-NEXT: paddw %xmm2, %xmm1 2600; SSE41-NEXT: movdqa %xmm1, %xmm0 2601; SSE41-NEXT: retq 2602; 2603; AVX1-LABEL: combine_vec_sdiv_nonuniform4: 2604; AVX1: # %bb.0: 2605; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639] 2606; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2607; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 2608; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 2609; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2610; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2611; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2612; AVX1-NEXT: retq 2613; 2614; AVX2-LABEL: 
combine_vec_sdiv_nonuniform4: 2615; AVX2: # %bb.0: 2616; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639] 2617; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2618; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1 2619; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2 2620; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2621; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2622; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2623; AVX2-NEXT: retq 2624; 2625; AVX512F-LABEL: combine_vec_sdiv_nonuniform4: 2626; AVX512F: # %bb.0: 2627; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639] 2628; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2629; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1 2630; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2 2631; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2632; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2633; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2634; AVX512F-NEXT: retq 2635; 2636; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4: 2637; AVX512BW: # %bb.0: 2638; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639] 2639; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2640; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2641; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2642; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2643; AVX512BW-NEXT: retq 2644; 2645; XOP-LABEL: combine_vec_sdiv_nonuniform4: 2646; XOP: # %bb.0: 2647; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639] 2648; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2649; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2650; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2651; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2652; XOP-NEXT: retq 2653 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510> 2654 ret <8 x i16> %1 2655} 2656 2657define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { 2658; SSE2-LABEL: combine_vec_sdiv_nonuniform5: 2659; SSE2: # %bb.0: 2660; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32639,54613,19945,21846,2979,5243,32897,32833] 2661; SSE2-NEXT: pmulhw %xmm0, %xmm1 2662; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] 2663; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65535,0,65535,0,0,0,1,1] 2664; SSE2-NEXT: paddw %xmm1, %xmm0 2665; SSE2-NEXT: pand %xmm2, %xmm1 2666; SSE2-NEXT: movdqa %xmm0, %xmm3 2667; SSE2-NEXT: psraw $8, %xmm3 2668; SSE2-NEXT: pandn %xmm3, %xmm2 2669; SSE2-NEXT: por %xmm1, %xmm2 2670; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,0,65535] 2671; SSE2-NEXT: pand %xmm1, %xmm2 2672; SSE2-NEXT: movdqa %xmm0, %xmm3 2673; SSE2-NEXT: psraw $4, %xmm3 2674; SSE2-NEXT: pandn %xmm3, %xmm1 2675; SSE2-NEXT: por %xmm2, %xmm1 2676; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] 2677; SSE2-NEXT: movdqa %xmm1, %xmm3 2678; SSE2-NEXT: pand %xmm2, %xmm3 2679; SSE2-NEXT: psraw $2, %xmm1 2680; SSE2-NEXT: pandn %xmm1, %xmm2 2681; SSE2-NEXT: por %xmm3, %xmm2 2682; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,65535] 2683; SSE2-NEXT: movdqa %xmm2, %xmm3 2684; SSE2-NEXT: pand %xmm1, %xmm3 2685; SSE2-NEXT: psraw $1, %xmm2 2686; SSE2-NEXT: pandn %xmm2, %xmm1 2687; SSE2-NEXT: por %xmm3, %xmm1 2688; SSE2-NEXT: psrlw $15, %xmm0 2689; SSE2-NEXT: paddw %xmm1, %xmm0 2690; SSE2-NEXT: retq 2691; 2692; SSE41-LABEL: combine_vec_sdiv_nonuniform5: 
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32639,54613,19945,21846,2979,5243,32897,32833]
; SSE2-NEXT: pmulhw %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $4, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $2, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $1, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [256,16384,4096,u,u,u,512,256]
; SSE41-NEXT: pmulhw %xmm1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; SSE41-NEXT: psraw $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7]
; SSE41-NEXT: psrlw $15, %xmm1
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [256,16384,4096,u,u,u,512,256]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [256,16384,4096,u,u,u,512,256]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7]
; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32639,54613,19945,21846,2979,5243,32897,32833]
; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}
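; Non-uniform divisors mixing INT16_MIN, +/-1, INT16_MAX and power-of-two-adjacent lanes.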
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32767,32767,32703,0,0,32897,32769,16385]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psraw $6, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: psraw $12, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: por %xmm2, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psraw $1, %xmm4
; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlw $15, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,256,256,u,u,512,256,8]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm2
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32767,32767,32703,0,0,32897,32769,16385]
; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}
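; fold (sdiv x, <-1,-1,-1,-1,1,1,1,1>) -> blend(sub(0, x), x)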
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}
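; PR38658 - <16 x i8> sdiv where only the final lane has a non-unit divisor (7).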
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,256,256,256,256,256,256,64]
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38658:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psllw $6, %xmm3
; SSE41-NEXT: psllw $8, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $6, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pr38658:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65427]
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,64]
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pr38658:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65427]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65427]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; XOP-LABEL: pr38658:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}
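; sdiv of i1/vXi1 operands folds to the first operand.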
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %r = sdiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %r = sdiv <4 x i1> %x, %y
  ret <4 x i1> %r
}
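; fold (sdiv x, 2) -> sra(add(x, srl(x, 31)), 1); a negative divisor additionally negates the result.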
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, 2
  ret i32 %1
}

define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -2
  ret i32 %1
}

define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $4, %al
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $4, %al
; CHECK-NEXT: retq
  %1 = sdiv i8 %x, 16
  ret i8 %1
}

define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $2, %al
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $6, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: retq
  %1 = sdiv i8 %x, -64
  ret i8 %1
}
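; i16/i32/i64 power-of-two divisors select the rounding bias with test+cmov rather than the shift-based bias used for i8 above.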
define i16 @combine_i16_sdiv_pow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: shrl $4, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
  %1 = sdiv i16 %x, 16
  ret i16 %1
}

define i16 @combine_i16_sdiv_negpow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
  %1 = sdiv i16 %x, -256
  ret i16 %1
}

define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $4, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, 16
  ret i32 %1
}

define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -256
  ret i32 %1
}

define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 15(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $4, %rax
; CHECK-NEXT: retq
  %1 = sdiv i64 %x, 16
  ret i64 %1
}

define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 255(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $8, %rax
; CHECK-NEXT: negq %rax
; CHECK-NEXT: retq
  %1 = sdiv i64 %x, -256
  ret i64 %1
}