; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

; fold (mul x, 0) -> 0
define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (mul x, 1) -> x
define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_one:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_one:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (mul x, -1) -> 0-x
define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (mul x, (1 << c)) -> x << c
define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 1, i32 2, i32 4, i32 16>
  ret <4 x i32> %1
}

define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    paddq %xmm0, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $4, %xmm2
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = mul <4 x i64> %x, <i64 1, i64 2, i64 4, i64 16>
  ret <4 x i64> %1
}

; fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -2, i32 -2, i32 -2, i32 -2>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
  ret <4 x i32> %1
}

define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_negpow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmovsxbq {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614]
; SSE-NEXT:    pmuludq %xmm5, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm5, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmovsxbq {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600]
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm4, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600]
; AVX-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
; AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = mul <4 x i64> %x, <i64 -1, i64 -2, i64 -4, i64 -16>
  ret <4 x i64> %1
}

; (mul (shl X, c1), c2) -> (mul X, c2 << c1)
define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_shl_const:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %2
}

; (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one use.
define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse0:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, %y
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %y, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse0:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pmulld %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, %y
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pmulld %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %y, %1
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)

define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_add:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_add:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
  ret <4 x i32> %2
}

; fold Y = sra (X, size(X)-1); mul (or (Y, 1), X) -> (abs X)

define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
; SSE-LABEL: combine_mul_to_abs_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pabsb %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_to_abs_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpabsb %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %o = or <16 x i8> %s, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %m = mul <16 x i8> %o, %x
  ret <16 x i8> %m
}

define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
; SSE-LABEL: combine_mul_to_abs_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubq %xmm0, %xmm1
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_to_abs_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = ashr <2 x i64> %x, <i64 63, i64 63>
  %o = or <2 x i64> %s, <i64 1, i64 1>
  %m = mul <2 x i64> %x, %o
  ret <2 x i64> %m
}

; 'Quadratic Reciprocity' - and(mul(x,x),2) -> 0

define i64 @combine_mul_self_knownbits(i64 %x) {
; SSE-LABEL: combine_mul_self_knownbits:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_knownbits:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    retq
  %1 = mul i64 %x, %x
  %2 = and i64 %1, 2
  ret i64 %2
}

define <4 x i32> @combine_mul_self_knownbits_vector(<4 x i32> %x) {
; SSE-LABEL: combine_mul_self_knownbits_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_knownbits_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, %x
  %2 = and <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

; mul(x,x) - bit[1] is 0, but if demanding the other bits the source must not be undef

define i64 @combine_mul_self_demandedbits(i64 %x) {
; SSE-LABEL: combine_mul_self_demandedbits:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    imulq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_demandedbits:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    imulq %rdi, %rax
; AVX-NEXT:    retq
  %1 = mul i64 %x, %x
  %2 = and i64 %1, -3
  ret i64 %2
}

define <4 x i32> @combine_mul_self_demandedbits_vector(<4 x i32> %x) {
; SSE-LABEL: combine_mul_self_demandedbits_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_demandedbits_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = freeze <4 x i32> %x
  %2 = mul <4 x i32> %1, %1
  %3 = and <4 x i32> %2, <i32 -3, i32 -3, i32 -3, i32 -3>
  ret <4 x i32> %3
}

; PR59217 - Reuse umul_lohi/smul_lohi node

define i64 @combine_mul_umul_lohi_i64(i64 %a, i64 %b) {
; SSE-LABEL: combine_mul_umul_lohi_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    mulq %rsi
; SSE-NEXT:    xorq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_umul_lohi_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    xorq %rdx, %rax
; AVX-NEXT:    retq
  %a128 = zext i64 %a to i128
  %b128 = zext i64 %b to i128
  %m128 = mul nuw i128 %a128, %b128
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %a, %b
  %r = xor i64 %lo, %hi
  ret i64 %r
}

define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
; SSE-LABEL: combine_mul_smul_lohi_commute_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    imulq %rsi
; SSE-NEXT:    xorq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_smul_lohi_commute_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    xorq %rdx, %rax
; AVX-NEXT:    retq
  %a128 = sext i64 %a to i128
  %b128 = sext i64 %b to i128
  %m128 = mul nsw i128 %a128, %b128
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %b, %a
  %r = xor i64 %lo, %hi
  ret i64 %r
}

define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
; SSE-LABEL: combine_mul_umul_lohi_const_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT:    mulq %rcx
; SSE-NEXT:    xorq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_umul_lohi_const_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT:    mulq %rcx
; AVX-NEXT:    xorq %rdx, %rax
; AVX-NEXT:    retq
  %h128 = zext i64 %h to i128
  %m128 = mul nuw i128 %h128, 14181476777654086739
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %h, 14181476777654086739
  %r = xor i64 %lo, %hi
  ret i64 %r
}

define i64 @combine_mul_smul_lohi_const_i64(i64 %h) {
; SSE-LABEL: combine_mul_smul_lohi_const_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    sarq $63, %rcx
; SSE-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT:    mulq %rsi
; SSE-NEXT:    imulq %rsi, %rcx
; SSE-NEXT:    addq %rdx, %rcx
; SSE-NEXT:    xorq %rcx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_smul_lohi_const_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    sarq $63, %rcx
; AVX-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    imulq %rsi, %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    xorq %rcx, %rax
; AVX-NEXT:    retq
  %h128 = sext i64 %h to i128
  %m128 = mul nsw i128 %h128, 14181476777654086739
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %h, 14181476777654086739
  %r = xor i64 %lo, %hi
  ret i64 %r
}

; This would infinite loop because DAGCombiner wants to turn this into a shift,
; but x86 lowering wants to avoid non-uniform vector shift amounts.

define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-LABEL: PR35579:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE-NEXT:    psllw $8, %xmm1
; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR35579:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
  ret <16 x i8> %r
}

; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15429
define <4 x i64> @fuzz15429(<4 x i64> %InVec) {
; SSE-LABEL: fuzz15429:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $3, %xmm2
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm0
; SSE-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; SSE-NEXT:    pinsrq $0, %rax, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fuzz15429:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT:    retq
  %mul = mul <4 x i64> %InVec, <i64 1, i64 2, i64 4, i64 8>
  %I = insertelement <4 x i64> %mul, i64 9223372036854775807, i64 0
  ret <4 x i64> %I
}