1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2 3; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-FAST 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-SLOW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=X64-AVX,X64-XOP 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ 10 11; 12; PowOf2 (uniform) 13; 14 15define <2 x i64> @mul_v2i64_8(<2 x i64> %a0) nounwind { 16; SSE-LABEL: mul_v2i64_8: 17; SSE: # %bb.0: 18; SSE-NEXT: psllq $3, %xmm0 19; SSE-NEXT: ret{{[l|q]}} 20; 21; X64-AVX-LABEL: mul_v2i64_8: 22; X64-AVX: # %bb.0: 23; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm0 24; X64-AVX-NEXT: retq 25 %1 = mul <2 x i64> %a0, <i64 8, i64 8> 26 ret <2 x i64> %1 27} 28 29define <4 x i32> @mul_v4i32_8(<4 x i32> %a0) nounwind { 30; SSE-LABEL: mul_v4i32_8: 31; SSE: # %bb.0: 32; SSE-NEXT: pslld $3, %xmm0 33; SSE-NEXT: ret{{[l|q]}} 34; 35; X64-AVX-LABEL: mul_v4i32_8: 36; X64-AVX: # %bb.0: 37; X64-AVX-NEXT: vpslld $3, %xmm0, %xmm0 38; X64-AVX-NEXT: retq 39 %1 = mul <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8> 40 ret <4 x i32> %1 41} 42 43define <8 x i16> @mul_v8i16_8(<8 x i16> %a0) nounwind { 44; SSE-LABEL: mul_v8i16_8: 45; SSE: # %bb.0: 46; SSE-NEXT: psllw $3, %xmm0 47; SSE-NEXT: ret{{[l|q]}} 48; 49; X64-AVX-LABEL: mul_v8i16_8: 50; X64-AVX: # %bb.0: 51; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm0 52; X64-AVX-NEXT: retq 53 %1 = mul <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 54 ret <8 x i16> %1 55} 56 57define <16 x i8> @mul_v16i8_32(<16 x i8> %a0) nounwind { 58; X86-SSE-LABEL: mul_v16i8_32: 59; X86-SSE: # %bb.0: 60; X86-SSE-NEXT: psllw $5, %xmm0 61; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 62; X86-SSE-NEXT: retl 63; 64; X64-SSE-LABEL: mul_v16i8_32: 65; X64-SSE: # %bb.0: 66; X64-SSE-NEXT: psllw $5, %xmm0 67; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 68; X64-SSE-NEXT: retq 69; 70; X64-XOP-LABEL: mul_v16i8_32: 71; X64-XOP: # %bb.0: 72; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 73; X64-XOP-NEXT: retq 74; 75; X64-AVX2-LABEL: mul_v16i8_32: 76; X64-AVX2: # %bb.0: 77; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 78; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 79; X64-AVX2-NEXT: retq 80; 81; X64-AVX512DQ-LABEL: mul_v16i8_32: 82; X64-AVX512DQ: # %bb.0: 83; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm0 84; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 85; X64-AVX512DQ-NEXT: retq 86 %1 = mul <16 x i8> %a0, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32> 87 ret <16 x i8> %1 88} 89 90; 91; PowOf2 (non-uniform) 92; 93 94define <2 x i64> @mul_v2i64_32_8(<2 x i64> %a0) nounwind { 95; SSE2-LABEL: mul_v2i64_32_8: 96; SSE2: # %bb.0: 97; SSE2-NEXT: movdqa %xmm0, %xmm1 98; SSE2-NEXT: psllq $5, %xmm1 99; SSE2-NEXT: psllq $3, %xmm0 100; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 101; SSE2-NEXT: ret{{[l|q]}} 102; 103; SSE4-LABEL: mul_v2i64_32_8: 104; SSE4: # %bb.0: 105; SSE4-NEXT: movdqa %xmm0, %xmm1 106; SSE4-NEXT: psllq $3, %xmm1 107; SSE4-NEXT: psllq $5, %xmm0 108; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 109; SSE4-NEXT: ret{{[l|q]}} 110; 111; X64-XOP-LABEL: mul_v2i64_32_8: 112; X64-XOP: # %bb.0: 113; X64-XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 114; X64-XOP-NEXT: retq 115; 116; X64-AVX2-LABEL: mul_v2i64_32_8: 117; X64-AVX2: # %bb.0: 118; X64-AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 119; X64-AVX2-NEXT: retq 120; 121; X64-AVX512DQ-LABEL: mul_v2i64_32_8: 122; X64-AVX512DQ: # %bb.0: 123; X64-AVX512DQ-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 124; X64-AVX512DQ-NEXT: retq 125 %1 = mul <2 x i64> %a0, <i64 32, i64 8> 126 ret <2 x i64> %1 127} 128 129define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind { 130; X86-SSE2-LABEL: mul_v4i32_1_2_4_8: 131; X86-SSE2: # %bb.0: 132; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 133; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 134; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 135; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 136; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 137; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 138; X86-SSE2-NEXT: retl 139; 140; X86-SSE4-LABEL: mul_v4i32_1_2_4_8: 141; X86-SSE4: # %bb.0: 142; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 143; X86-SSE4-NEXT: retl 144; 145; X64-SSE2-LABEL: mul_v4i32_1_2_4_8: 146; X64-SSE2: # %bb.0: 147; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 148; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 149; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 150; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 151; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 152; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 153; X64-SSE2-NEXT: retq 154; 155; X64-SSE4-LABEL: mul_v4i32_1_2_4_8: 156; X64-SSE4: # %bb.0: 157; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 158; X64-SSE4-NEXT: retq 159; 160; X64-XOP-LABEL: mul_v4i32_1_2_4_8: 161; X64-XOP: # %bb.0: 162; X64-XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 163; X64-XOP-NEXT: retq 164; 165; X64-AVX2-LABEL: mul_v4i32_1_2_4_8: 166; X64-AVX2: # %bb.0: 167; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 168; X64-AVX2-NEXT: retq 169; 170; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8: 171; X64-AVX512DQ: # %bb.0: 172; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 173; X64-AVX512DQ-NEXT: retq 174 %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8> 175 ret <4 x i32> %1 176} 177 178define <4 x i32> @mul_v4i32_1_2_4_8_optsize(<4 x i32> %a0) nounwind optsize { 179; SSE2-LABEL: mul_v4i32_1_2_4_8_optsize: 180; SSE2: # %bb.0: 181; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8] 182; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 183; SSE2-NEXT: pmuludq %xmm1, %xmm0 184; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 185; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 186; SSE2-NEXT: pmuludq %xmm2, %xmm1 187; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 188; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 189; SSE2-NEXT: ret{{[l|q]}} 190; 191; X86-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize: 192; X86-SSE4: # %bb.0: 193; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 194; X86-SSE4-NEXT: retl 195; 196; X64-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize: 197; X64-SSE4: # %bb.0: 198; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 199; X64-SSE4-NEXT: retq 200; 201; X64-XOP-LABEL: mul_v4i32_1_2_4_8_optsize: 202; X64-XOP: # %bb.0: 203; X64-XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 204; X64-XOP-NEXT: retq 205; 206; X64-AVX2-LABEL: mul_v4i32_1_2_4_8_optsize: 207; X64-AVX2: # %bb.0: 208; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 209; X64-AVX2-NEXT: retq 210; 211; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8_optsize: 212; X64-AVX512DQ: # %bb.0: 213; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 214; X64-AVX512DQ-NEXT: retq 215 %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8> 216 ret <4 x i32> %1 217} 218 219define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind { 220; X86-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128: 221; X86-SSE: # %bb.0: 222; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128] 223; X86-SSE-NEXT: retl 224; 225; X64-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128: 226; X64-SSE: # %bb.0: 227; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] 228; X64-SSE-NEXT: retq 229; 230; X64-XOP-LABEL: mul_v8i16_1_2_4_8_16_32_64_128: 231; X64-XOP: # %bb.0: 232; X64-XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 233; X64-XOP-NEXT: retq 234; 235; X64-AVX2-LABEL: mul_v8i16_1_2_4_8_16_32_64_128: 236; X64-AVX2: # %bb.0: 237; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 238; X64-AVX2-NEXT: retq 239; 240; X64-AVX512DQ-LABEL: mul_v8i16_1_2_4_8_16_32_64_128: 241; X64-AVX512DQ: # %bb.0: 242; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 243; X64-AVX512DQ-NEXT: retq 244 %1 = mul <8 x i16> %a0, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128> 245 ret <8 x i16> %1 246} 247 248define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind { 249; SSE2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: 250; SSE2: # %bb.0: 251; SSE2-NEXT: movdqa %xmm0, %xmm1 252; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 253; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8] 254; SSE2-NEXT: pmullw %xmm2, %xmm1 255; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 256; SSE2-NEXT: pand %xmm3, %xmm1 257; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 258; SSE2-NEXT: pmullw %xmm2, %xmm0 259; SSE2-NEXT: pand %xmm3, %xmm0 260; SSE2-NEXT: packuswb %xmm1, %xmm0 261; SSE2-NEXT: ret{{[l|q]}} 262; 263; X86-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: 264; X86-SSE4: # %bb.0: 265; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 266; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8] 267; X86-SSE4-NEXT: psllw $8, %xmm1 268; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0] 269; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 270; X86-SSE4-NEXT: por %xmm1, %xmm0 271; X86-SSE4-NEXT: retl 272; 273; X64-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: 274; X64-SSE4: # %bb.0: 275; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 276; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8] 277; X64-SSE4-NEXT: psllw $8, %xmm1 278; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0] 279; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 280; X64-SSE4-NEXT: por %xmm1, %xmm0 281; X64-SSE4-NEXT: retq 282; 283; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: 284; X64-XOP: # %bb.0: 285; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 286; X64-XOP-NEXT: retq 287; 288; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: 289; X64-AVX2: # %bb.0: 290; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 291; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8] 292; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 293; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 294; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 295; X64-AVX2-NEXT: vzeroupper 296; X64-AVX2-NEXT: retq 297; 298; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: 299; X64-AVX512DQ: # %bb.0: 300; X64-AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 301; X64-AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 302; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 303; X64-AVX512DQ-NEXT: vzeroupper 304; X64-AVX512DQ-NEXT: retq 305 %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8> 306 ret <16 x i8> %1 307} 308 309; 310; PowOf2 + 1 (uniform) 311; 312 313define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind { 314; SSE-LABEL: mul_v2i64_17: 315; SSE: # %bb.0: 316; SSE-NEXT: movdqa %xmm0, %xmm1 317; SSE-NEXT: psllq $4, %xmm1 318; SSE-NEXT: paddq %xmm1, %xmm0 319; SSE-NEXT: ret{{[l|q]}} 320; 321; X64-AVX-LABEL: mul_v2i64_17: 322; X64-AVX: # %bb.0: 323; X64-AVX-NEXT: vpsllq $4, %xmm0, %xmm1 324; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 325; X64-AVX-NEXT: retq 326 %1 = mul <2 x i64> %a0, <i64 17, i64 17> 327 ret <2 x i64> %1 328} 329 330define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind { 331; SSE-LABEL: mul_v4i32_17: 332; SSE: # %bb.0: 333; SSE-NEXT: movdqa %xmm0, %xmm1 334; SSE-NEXT: pslld $4, %xmm1 335; SSE-NEXT: paddd %xmm1, %xmm0 336; SSE-NEXT: ret{{[l|q]}} 337; 338; X64-AVX-LABEL: mul_v4i32_17: 339; X64-AVX: # %bb.0: 340; X64-AVX-NEXT: vpslld $4, %xmm0, %xmm1 341; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 342; X64-AVX-NEXT: retq 343 %1 = mul <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17> 344 ret <4 x i32> %1 345} 346 347define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind { 348; SSE-LABEL: mul_v8i16_17: 349; SSE: # %bb.0: 350; SSE-NEXT: movdqa %xmm0, %xmm1 351; SSE-NEXT: psllw $4, %xmm1 352; SSE-NEXT: paddw %xmm1, %xmm0 353; SSE-NEXT: ret{{[l|q]}} 354; 355; X64-AVX-LABEL: mul_v8i16_17: 356; X64-AVX: # %bb.0: 357; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm1 358; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 359; X64-AVX-NEXT: retq 360 %1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17> 361 ret <8 x i16> %1 362} 363 364define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind { 365; X86-SSE-LABEL: mul_v16i8_17: 366; X86-SSE: # %bb.0: 367; X86-SSE-NEXT: movdqa %xmm0, %xmm1 368; X86-SSE-NEXT: psllw $4, %xmm1 369; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 370; X86-SSE-NEXT: paddb %xmm1, %xmm0 371; X86-SSE-NEXT: retl 372; 373; X64-SSE-LABEL: mul_v16i8_17: 374; X64-SSE: # %bb.0: 375; X64-SSE-NEXT: movdqa %xmm0, %xmm1 376; X64-SSE-NEXT: psllw $4, %xmm1 377; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 378; X64-SSE-NEXT: paddb %xmm1, %xmm0 379; X64-SSE-NEXT: retq 380; 381; X64-XOP-LABEL: mul_v16i8_17: 382; X64-XOP: # %bb.0: 383; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 384; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 385; X64-XOP-NEXT: retq 386; 387; X64-AVX2-LABEL: mul_v16i8_17: 388; X64-AVX2: # %bb.0: 389; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1 390; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 391; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 392; X64-AVX2-NEXT: retq 393; 394; X64-AVX512DQ-LABEL: mul_v16i8_17: 395; X64-AVX512DQ: # %bb.0: 396; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1 397; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 398; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 399; X64-AVX512DQ-NEXT: retq 400 %1 = mul <16 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17> 401 ret <16 x i8> %1 402} 403 404define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind { 405; SSE-LABEL: mul_v4i64_17: 406; SSE: # %bb.0: 407; SSE-NEXT: movdqa %xmm0, %xmm2 408; SSE-NEXT: psllq $4, %xmm2 409; SSE-NEXT: paddq %xmm2, %xmm0 410; SSE-NEXT: movdqa %xmm1, %xmm2 411; SSE-NEXT: psllq $4, %xmm2 412; SSE-NEXT: paddq %xmm2, %xmm1 413; SSE-NEXT: ret{{[l|q]}} 414; 415; X64-XOP-LABEL: mul_v4i64_17: 416; X64-XOP: # %bb.0: 417; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 418; X64-XOP-NEXT: vpsllq $4, %xmm1, %xmm2 419; X64-XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1 420; X64-XOP-NEXT: vpsllq $4, %xmm0, %xmm2 421; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 422; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 423; X64-XOP-NEXT: retq 424; 425; X64-AVX2-LABEL: mul_v4i64_17: 426; X64-AVX2: # %bb.0: 427; X64-AVX2-NEXT: vpsllq $4, %ymm0, %ymm1 428; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 429; X64-AVX2-NEXT: retq 430; 431; X64-AVX512DQ-LABEL: mul_v4i64_17: 432; X64-AVX512DQ: # %bb.0: 433; X64-AVX512DQ-NEXT: vpsllq $4, %ymm0, %ymm1 434; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 435; X64-AVX512DQ-NEXT: retq 436 %1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17> 437 ret <4 x i64> %1 438} 439 440define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind { 441; SSE-LABEL: mul_v8i32_17: 442; SSE: # %bb.0: 443; SSE-NEXT: movdqa %xmm0, %xmm2 444; SSE-NEXT: pslld $4, %xmm2 445; SSE-NEXT: paddd %xmm2, %xmm0 446; SSE-NEXT: movdqa %xmm1, %xmm2 447; SSE-NEXT: pslld $4, %xmm2 448; SSE-NEXT: paddd %xmm2, %xmm1 449; SSE-NEXT: ret{{[l|q]}} 450; 451; X64-XOP-LABEL: mul_v8i32_17: 452; X64-XOP: # %bb.0: 453; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 454; X64-XOP-NEXT: vpslld $4, %xmm1, %xmm2 455; X64-XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1 456; X64-XOP-NEXT: vpslld $4, %xmm0, %xmm2 457; X64-XOP-NEXT: vpaddd %xmm0, %xmm2, %xmm0 458; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 459; X64-XOP-NEXT: retq 460; 461; X64-AVX2-LABEL: mul_v8i32_17: 462; X64-AVX2: # %bb.0: 463; X64-AVX2-NEXT: vpslld $4, %ymm0, %ymm1 464; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 465; X64-AVX2-NEXT: retq 466; 467; X64-AVX512DQ-LABEL: mul_v8i32_17: 468; X64-AVX512DQ: # %bb.0: 469; X64-AVX512DQ-NEXT: vpslld $4, %ymm0, %ymm1 470; X64-AVX512DQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0 471; X64-AVX512DQ-NEXT: retq 472 %1 = mul <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 473 ret <8 x i32> %1 474} 475 476define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind { 477; SSE-LABEL: mul_v16i16_17: 478; SSE: # %bb.0: 479; SSE-NEXT: movdqa %xmm0, %xmm2 480; SSE-NEXT: psllw $4, %xmm2 481; SSE-NEXT: paddw %xmm2, %xmm0 482; SSE-NEXT: movdqa %xmm1, %xmm2 483; SSE-NEXT: psllw $4, %xmm2 484; SSE-NEXT: paddw %xmm2, %xmm1 485; SSE-NEXT: ret{{[l|q]}} 486; 487; X64-XOP-LABEL: mul_v16i16_17: 488; X64-XOP: # %bb.0: 489; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 490; X64-XOP-NEXT: vpsllw $4, %xmm1, %xmm2 491; X64-XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1 492; X64-XOP-NEXT: vpsllw $4, %xmm0, %xmm2 493; X64-XOP-NEXT: vpaddw %xmm0, %xmm2, %xmm0 494; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 495; X64-XOP-NEXT: retq 496; 497; X64-AVX2-LABEL: mul_v16i16_17: 498; X64-AVX2: # %bb.0: 499; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 500; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 501; X64-AVX2-NEXT: retq 502; 503; X64-AVX512DQ-LABEL: mul_v16i16_17: 504; X64-AVX512DQ: # %bb.0: 505; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1 506; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 507; X64-AVX512DQ-NEXT: retq 508 %1 = mul <16 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17> 509 ret <16 x i16> %1 510} 511 512define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind { 513; SSE-LABEL: mul_v32i8_17: 514; SSE: # %bb.0: 515; SSE-NEXT: movdqa %xmm0, %xmm2 516; SSE-NEXT: psllw $4, %xmm2 517; SSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 518; SSE-NEXT: pand %xmm3, %xmm2 519; SSE-NEXT: paddb %xmm2, %xmm0 520; SSE-NEXT: movdqa %xmm1, %xmm2 521; SSE-NEXT: psllw $4, %xmm2 522; SSE-NEXT: pand %xmm3, %xmm2 523; SSE-NEXT: paddb %xmm2, %xmm1 524; SSE-NEXT: ret{{[l|q]}} 525; 526; X64-XOP-LABEL: mul_v32i8_17: 527; X64-XOP: # %bb.0: 528; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 529; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 530; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3 531; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 532; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2 533; X64-XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 534; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 535; X64-XOP-NEXT: retq 536; 537; X64-AVX2-LABEL: mul_v32i8_17: 538; X64-AVX2: # %bb.0: 539; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 540; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 541; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 542; X64-AVX2-NEXT: retq 543; 544; X64-AVX512DQ-LABEL: mul_v32i8_17: 545; X64-AVX512DQ: # %bb.0: 546; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1 547; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 548; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 549; X64-AVX512DQ-NEXT: retq 550 %1 = mul <32 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17> 551 ret <32 x i8> %1 552} 553 554; 555; -(PowOf2 + 1) (uniform) 556; 557 558define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind { 559; SSE-LABEL: mul_v2i64_neg1025: 560; SSE: # %bb.0: 561; SSE-NEXT: movdqa %xmm0, %xmm1 562; SSE-NEXT: psllq $10, %xmm1 563; SSE-NEXT: paddq %xmm0, %xmm1 564; SSE-NEXT: pxor %xmm0, %xmm0 565; SSE-NEXT: psubq %xmm1, %xmm0 566; SSE-NEXT: ret{{[l|q]}} 567; 568; X64-AVX-LABEL: mul_v2i64_neg1025: 569; X64-AVX: # %bb.0: 570; X64-AVX-NEXT: vpsllq $10, %xmm0, %xmm1 571; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 572; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 573; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 574; X64-AVX-NEXT: retq 575 %1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025> 576 ret <2 x i64> %1 577} 578 579define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind { 580; SSE-LABEL: mul_v4i32_neg33: 581; SSE: # %bb.0: 582; SSE-NEXT: movdqa %xmm0, %xmm1 583; SSE-NEXT: pslld $5, %xmm1 584; SSE-NEXT: paddd %xmm0, %xmm1 585; SSE-NEXT: pxor %xmm0, %xmm0 586; SSE-NEXT: psubd %xmm1, %xmm0 587; SSE-NEXT: ret{{[l|q]}} 588; 589; X64-AVX-LABEL: mul_v4i32_neg33: 590; X64-AVX: # %bb.0: 591; X64-AVX-NEXT: vpslld $5, %xmm0, %xmm1 592; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 593; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 594; X64-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 595; X64-AVX-NEXT: retq 596 %1 = mul <4 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33> 597 ret <4 x i32> %1 598} 599 600define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind { 601; SSE-LABEL: mul_v8i16_neg9: 602; SSE: # %bb.0: 603; SSE-NEXT: movdqa %xmm0, %xmm1 604; SSE-NEXT: psllw $3, %xmm1 605; SSE-NEXT: paddw %xmm0, %xmm1 606; SSE-NEXT: pxor %xmm0, %xmm0 607; SSE-NEXT: psubw %xmm1, %xmm0 608; SSE-NEXT: ret{{[l|q]}} 609; 610; X64-AVX-LABEL: mul_v8i16_neg9: 611; X64-AVX: # %bb.0: 612; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1 613; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 614; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 615; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 616; X64-AVX-NEXT: retq 617 %1 = mul <8 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9> 618 ret <8 x i16> %1 619} 620 621define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind { 622; X86-SSE-LABEL: mul_v16i8_neg5: 623; X86-SSE: # %bb.0: 624; X86-SSE-NEXT: movdqa %xmm0, %xmm1 625; X86-SSE-NEXT: psllw $2, %xmm1 626; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 627; X86-SSE-NEXT: paddb %xmm0, %xmm1 628; X86-SSE-NEXT: pxor %xmm0, %xmm0 629; X86-SSE-NEXT: psubb %xmm1, %xmm0 630; X86-SSE-NEXT: retl 631; 632; X64-SSE-LABEL: mul_v16i8_neg5: 633; X64-SSE: # %bb.0: 634; X64-SSE-NEXT: movdqa %xmm0, %xmm1 635; X64-SSE-NEXT: psllw $2, %xmm1 636; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 637; X64-SSE-NEXT: paddb %xmm0, %xmm1 638; X64-SSE-NEXT: pxor %xmm0, %xmm0 639; X64-SSE-NEXT: psubb %xmm1, %xmm0 640; X64-SSE-NEXT: retq 641; 642; X64-XOP-LABEL: mul_v16i8_neg5: 643; X64-XOP: # %bb.0: 644; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 645; X64-XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 646; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 647; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0 648; X64-XOP-NEXT: retq 649; 650; X64-AVX2-LABEL: mul_v16i8_neg5: 651; X64-AVX2: # %bb.0: 652; X64-AVX2-NEXT: vpsllw $2, %xmm0, %xmm1 653; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 654; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 655; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 656; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 657; X64-AVX2-NEXT: retq 658; 659; X64-AVX512DQ-LABEL: mul_v16i8_neg5: 660; X64-AVX512DQ: # %bb.0: 661; X64-AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm1 662; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 663; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 664; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 665; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0 666; X64-AVX512DQ-NEXT: retq 667 %1 = mul <16 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5> 668 ret <16 x i8> %1 669} 670 671define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind { 672; SSE-LABEL: mul_v4i64_neg1025: 673; SSE: # %bb.0: 674; SSE-NEXT: movdqa %xmm0, %xmm3 675; SSE-NEXT: psllq $10, %xmm3 676; SSE-NEXT: paddq %xmm0, %xmm3 677; SSE-NEXT: pxor %xmm2, %xmm2 678; SSE-NEXT: pxor %xmm0, %xmm0 679; SSE-NEXT: psubq %xmm3, %xmm0 680; SSE-NEXT: movdqa %xmm1, %xmm3 681; SSE-NEXT: psllq $10, %xmm3 682; SSE-NEXT: paddq %xmm1, %xmm3 683; SSE-NEXT: psubq %xmm3, %xmm2 684; SSE-NEXT: movdqa %xmm2, %xmm1 685; SSE-NEXT: ret{{[l|q]}} 686; 687; X64-XOP-LABEL: mul_v4i64_neg1025: 688; X64-XOP: # %bb.0: 689; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 690; X64-XOP-NEXT: vpsllq $10, %xmm1, %xmm2 691; X64-XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1 692; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 693; X64-XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 694; X64-XOP-NEXT: vpsllq $10, %xmm0, %xmm3 695; X64-XOP-NEXT: vpaddq %xmm0, %xmm3, %xmm0 696; X64-XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 697; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 698; X64-XOP-NEXT: retq 699; 700; X64-AVX2-LABEL: mul_v4i64_neg1025: 701; X64-AVX2: # %bb.0: 702; X64-AVX2-NEXT: vpsllq $10, %ymm0, %ymm1 703; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 704; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 705; X64-AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 706; X64-AVX2-NEXT: retq 707; 708; X64-AVX512DQ-LABEL: mul_v4i64_neg1025: 709; X64-AVX512DQ: # %bb.0: 710; X64-AVX512DQ-NEXT: vpsllq $10, %ymm0, %ymm1 711; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 712; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 713; X64-AVX512DQ-NEXT: vpsubq %ymm0, %ymm1, %ymm0 714; X64-AVX512DQ-NEXT: retq 715 %1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025> 716 ret <4 x i64> %1 717} 718 719define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind { 720; SSE-LABEL: mul_v8i32_neg33: 721; SSE: # %bb.0: 722; SSE-NEXT: movdqa %xmm0, %xmm3 723; SSE-NEXT: pslld $5, %xmm3 724; SSE-NEXT: paddd %xmm0, %xmm3 725; SSE-NEXT: pxor %xmm2, %xmm2 726; SSE-NEXT: pxor %xmm0, %xmm0 727; SSE-NEXT: psubd %xmm3, %xmm0 728; SSE-NEXT: movdqa %xmm1, %xmm3 729; SSE-NEXT: pslld $5, %xmm3 730; SSE-NEXT: paddd %xmm1, %xmm3 731; SSE-NEXT: psubd %xmm3, %xmm2 732; SSE-NEXT: movdqa %xmm2, %xmm1 733; SSE-NEXT: ret{{[l|q]}} 734; 735; X64-XOP-LABEL: mul_v8i32_neg33: 736; X64-XOP: # %bb.0: 737; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 738; X64-XOP-NEXT: vpslld $5, %xmm1, %xmm2 739; X64-XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1 740; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 741; X64-XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1 742; X64-XOP-NEXT: vpslld $5, %xmm0, %xmm3 743; X64-XOP-NEXT: vpaddd %xmm0, %xmm3, %xmm0 744; X64-XOP-NEXT: vpsubd %xmm0, %xmm2, %xmm0 745; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 746; X64-XOP-NEXT: retq 747; 748; X64-AVX2-LABEL: mul_v8i32_neg33: 749; X64-AVX2: # %bb.0: 750; X64-AVX2-NEXT: vpslld $5, %ymm0, %ymm1 751; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 752; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 753; X64-AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 754; X64-AVX2-NEXT: retq 755; 756; X64-AVX512DQ-LABEL: mul_v8i32_neg33: 757; X64-AVX512DQ: # %bb.0: 758; X64-AVX512DQ-NEXT: vpslld $5, %ymm0, %ymm1 759; X64-AVX512DQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0 760; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 761; X64-AVX512DQ-NEXT: vpsubd %ymm0, %ymm1, %ymm0 762; X64-AVX512DQ-NEXT: retq 763 %1 = mul <8 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33> 764 ret <8 x i32> %1 765} 766 767define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind { 768; SSE-LABEL: mul_v16i16_neg9: 769; SSE: # %bb.0: 770; SSE-NEXT: movdqa %xmm0, %xmm3 771; SSE-NEXT: psllw $3, %xmm3 772; SSE-NEXT: paddw %xmm0, %xmm3 773; SSE-NEXT: pxor %xmm2, %xmm2 774; SSE-NEXT: pxor %xmm0, %xmm0 775; SSE-NEXT: psubw %xmm3, %xmm0 776; SSE-NEXT: movdqa %xmm1, %xmm3 777; SSE-NEXT: psllw $3, %xmm3 778; SSE-NEXT: paddw %xmm1, %xmm3 779; SSE-NEXT: psubw %xmm3, %xmm2 780; SSE-NEXT: movdqa %xmm2, %xmm1 781; SSE-NEXT: ret{{[l|q]}} 782; 783; X64-XOP-LABEL: mul_v16i16_neg9: 784; X64-XOP: # %bb.0: 785; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 786; X64-XOP-NEXT: vpsllw $3, %xmm1, %xmm2 787; X64-XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1 788; X64-XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 789; X64-XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 790; X64-XOP-NEXT: vpsllw $3, %xmm0, %xmm3 791; X64-XOP-NEXT: vpaddw %xmm0, %xmm3, %xmm0 792; X64-XOP-NEXT: vpsubw %xmm0, %xmm2, %xmm0 793; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 794; X64-XOP-NEXT: retq 795; 796; X64-AVX2-LABEL: mul_v16i16_neg9: 797; X64-AVX2: # %bb.0: 798; X64-AVX2-NEXT: vpsllw $3, %ymm0, %ymm1 799; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 800; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 801; X64-AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm0 802; X64-AVX2-NEXT: retq 803; 804; X64-AVX512DQ-LABEL: mul_v16i16_neg9: 805; X64-AVX512DQ: # %bb.0: 806; X64-AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1 807; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 808; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 809; X64-AVX512DQ-NEXT: vpsubw %ymm0, %ymm1, %ymm0 810; X64-AVX512DQ-NEXT: retq 811 %1 = mul <16 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9> 812 ret <16 x i16> %1 813} 814 815define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind { 816; SSE-LABEL: mul_v32i8_neg5: 817; SSE: # %bb.0: 818; SSE-NEXT: movdqa %xmm0, %xmm3 819; SSE-NEXT: psllw $2, %xmm3 820; SSE-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 821; SSE-NEXT: pand %xmm4, %xmm3 822; SSE-NEXT: paddb %xmm0, %xmm3 823; SSE-NEXT: pxor %xmm2, %xmm2 824; SSE-NEXT: pxor %xmm0, %xmm0 825; SSE-NEXT: psubb %xmm3, %xmm0 826; SSE-NEXT: movdqa %xmm1, %xmm3 827; SSE-NEXT: psllw $2, %xmm3 828; SSE-NEXT: pand %xmm4, %xmm3 829; SSE-NEXT: paddb %xmm1, %xmm3 830; SSE-NEXT: psubb %xmm3, %xmm2 831; SSE-NEXT: movdqa %xmm2, %xmm1 832; SSE-NEXT: ret{{[l|q]}} 833; 834; X64-XOP-LABEL: mul_v32i8_neg5: 835; X64-XOP: # %bb.0: 836; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 837; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 838; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3 839; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 840; X64-XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 841; X64-XOP-NEXT: vpsubb %xmm1, %xmm3, %xmm1 842; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2 843; X64-XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 844; X64-XOP-NEXT: vpsubb %xmm0, %xmm3, %xmm0 845; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 846; X64-XOP-NEXT: retq 847; 848; X64-AVX2-LABEL: mul_v32i8_neg5: 849; X64-AVX2: # %bb.0: 850; X64-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 851; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 852; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 853; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 854; X64-AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm0 855; X64-AVX2-NEXT: retq 856; 857; X64-AVX512DQ-LABEL: mul_v32i8_neg5: 858; X64-AVX512DQ: # %bb.0: 859; X64-AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1 860; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 861; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 862; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 863; X64-AVX512DQ-NEXT: vpsubb %ymm0, %ymm1, %ymm0 864; X64-AVX512DQ-NEXT: retq 865 %1 = mul <32 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5> 866 ret <32 x i8> %1 867} 868 869; 870; PowOf2 + 1 (non-uniform) 871; 872 873define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind { 874; X86-SSE2-LABEL: mul_v2i64_17_65: 875; X86-SSE2: # %bb.0: 876; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0] 877; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 878; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 879; X86-SSE2-NEXT: psrlq $32, %xmm0 880; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 881; X86-SSE2-NEXT: psllq $32, %xmm0 882; X86-SSE2-NEXT: paddq %xmm2, %xmm0 883; X86-SSE2-NEXT: retl 884; 885; SSE4-LABEL: mul_v2i64_17_65: 886; SSE4: # %bb.0: 887; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [17,65] 888; SSE4-NEXT: movdqa %xmm0, %xmm2 889; SSE4-NEXT: pmuludq %xmm1, %xmm2 890; SSE4-NEXT: psrlq $32, %xmm0 891; SSE4-NEXT: pmuludq %xmm1, %xmm0 892; SSE4-NEXT: psllq $32, %xmm0 893; SSE4-NEXT: paddq %xmm2, %xmm0 894; SSE4-NEXT: ret{{[l|q]}} 895; 896; X64-SSE2-LABEL: mul_v2i64_17_65: 897; X64-SSE2: # %bb.0: 898; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,65] 899; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 900; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 901; X64-SSE2-NEXT: psrlq $32, %xmm0 902; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 903; X64-SSE2-NEXT: psllq $32, %xmm0 904; X64-SSE2-NEXT: paddq %xmm2, %xmm0 905; X64-SSE2-NEXT: retq 906; 907; X64-XOP-LABEL: mul_v2i64_17_65: 908; X64-XOP: # %bb.0: 909; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,65] 910; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 911; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 912; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 913; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 914; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 915; X64-XOP-NEXT: retq 916; 917; X64-AVX2-LABEL: mul_v2i64_17_65: 918; X64-AVX2: # %bb.0: 919; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,65] 920; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 921; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 922; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 923; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 924; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 925; X64-AVX2-NEXT: retq 926; 927; X64-AVX512DQ-LABEL: mul_v2i64_17_65: 928; X64-AVX512DQ: # %bb.0: 929; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 930; X64-AVX512DQ-NEXT: retq 931 %1 = mul <2 x i64> %a0, <i64 17, i64 65> 932 ret <2 x i64> %1 933} 934 935define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind { 936; X86-SSE2-LABEL: mul_v4i32_5_17_33_65: 937; X86-SSE2: # %bb.0: 938; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 939; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 940; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 941; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 942; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 943; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 944; X86-SSE2-NEXT: retl 945; 946; X86-SSE4-LABEL: mul_v4i32_5_17_33_65: 947; X86-SSE4: # %bb.0: 948; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 949; X86-SSE4-NEXT: retl 950; 951; X64-SSE2-LABEL: mul_v4i32_5_17_33_65: 952; X64-SSE2: # %bb.0: 953; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 954; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 955; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 956; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 957; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 958; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 959; X64-SSE2-NEXT: retq 960; 961; X64-SSE4-LABEL: mul_v4i32_5_17_33_65: 962; X64-SSE4: # %bb.0: 963; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 964; X64-SSE4-NEXT: retq 965; 966; X64-AVX-LABEL: mul_v4i32_5_17_33_65: 967; X64-AVX: # %bb.0: 968; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 969; X64-AVX-NEXT: retq 970 %1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65> 971 ret <4 x i32> %1 972} 973 974define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind { 975; X86-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257: 976; X86-SSE: # %bb.0: 977; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,257] 978; X86-SSE-NEXT: retl 979; 980; X64-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257: 981; X64-SSE: # %bb.0: 982; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,257] 983; X64-SSE-NEXT: retq 984; 985; X64-AVX-LABEL: mul_v8i16_2_3_9_17_33_65_129_257: 986; X64-AVX: # %bb.0: 987; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3,9,17,33,65,129,257] 988; X64-AVX-NEXT: retq 989 %1 = mul <8 x i16> %a0, <i16 2, i16 3, i16 9, i16 17, i16 33, i16 65, i16 129, i16 257> 990 ret <8 x i16> %1 991} 992 993define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind { 994; X86-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 995; X86-SSE2: # %bb.0: 996; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 997; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 998; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,9,17,33,65,129,2,3] 999; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1000; X86-SSE2-NEXT: pand %xmm2, %xmm1 1001; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1002; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2] 1003; X86-SSE2-NEXT: pand %xmm2, %xmm0 1004; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 1005; X86-SSE2-NEXT: retl 1006; 1007; X86-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 1008; X86-SSE4: # %bb.0: 1009; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 1010; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3] 1011; X86-SSE4-NEXT: psllw $8, %xmm1 1012; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0] 1013; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1014; X86-SSE4-NEXT: por %xmm1, %xmm0 1015; X86-SSE4-NEXT: retl 1016; 1017; X64-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 1018; X64-SSE2: # %bb.0: 1019; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 1020; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1021; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,9,17,33,65,129,2,3] 1022; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1023; X64-SSE2-NEXT: pand %xmm2, %xmm1 1024; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1025; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2] 1026; X64-SSE2-NEXT: pand %xmm2, %xmm0 1027; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 1028; X64-SSE2-NEXT: retq 1029; 1030; X64-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 1031; X64-SSE4: # %bb.0: 1032; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 1033; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3] 1034; X64-SSE4-NEXT: psllw $8, %xmm1 1035; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0] 1036; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1037; X64-SSE4-NEXT: por %xmm1, %xmm0 1038; X64-SSE4-NEXT: retq 1039; 1040; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 1041; X64-XOP: # %bb.0: 1042; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3] 1043; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0] 1044; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14] 1045; X64-XOP-NEXT: retq 1046; 1047; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 1048; X64-AVX2: # %bb.0: 1049; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1050; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3] 1051; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1052; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1053; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1054; X64-AVX2-NEXT: vzeroupper 1055; X64-AVX2-NEXT: retq 1056; 1057; X64-AVX512DQ-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: 1058; X64-AVX512DQ: # %bb.0: 1059; X64-AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1060; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3] 1061; X64-AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1062; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1063; X64-AVX512DQ-NEXT: vzeroupper 1064; X64-AVX512DQ-NEXT: retq 1065 %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3> 1066 ret <16 x i8> %1 1067} 1068 1069; 1070; PowOf2 - 1 (uniform) 1071; 1072 1073define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind { 1074; SSE-LABEL: mul_v2i64_7: 1075; SSE: # %bb.0: 1076; SSE-NEXT: movdqa %xmm0, %xmm1 1077; SSE-NEXT: psllq $3, %xmm1 1078; SSE-NEXT: psubq %xmm0, %xmm1 1079; SSE-NEXT: movdqa %xmm1, %xmm0 1080; SSE-NEXT: ret{{[l|q]}} 1081; 1082; X64-AVX-LABEL: mul_v2i64_7: 1083; X64-AVX: # %bb.0: 1084; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1 1085; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 1086; X64-AVX-NEXT: retq 1087 %1 = mul <2 x i64> %a0, <i64 7, i64 7> 1088 ret <2 x i64> %1 1089} 1090 1091define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind { 1092; SSE-LABEL: mul_v4i32_7: 1093; SSE: # %bb.0: 1094; SSE-NEXT: movdqa %xmm0, %xmm1 1095; SSE-NEXT: pslld $3, %xmm1 1096; SSE-NEXT: psubd %xmm0, %xmm1 1097; SSE-NEXT: movdqa %xmm1, %xmm0 1098; SSE-NEXT: ret{{[l|q]}} 1099; 1100; X64-AVX-LABEL: mul_v4i32_7: 1101; X64-AVX: # %bb.0: 1102; X64-AVX-NEXT: vpslld $3, %xmm0, %xmm1 1103; X64-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 1104; X64-AVX-NEXT: retq 1105 %1 = mul <4 x i32> %a0, <i32 7, i32 7, i32 7, i32 7> 1106 ret <4 x i32> %1 1107} 1108 1109define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind { 1110; SSE-LABEL: mul_v8i16_7: 1111; SSE: # %bb.0: 1112; SSE-NEXT: movdqa %xmm0, %xmm1 1113; SSE-NEXT: psllw $3, %xmm1 1114; SSE-NEXT: psubw %xmm0, %xmm1 1115; SSE-NEXT: movdqa %xmm1, %xmm0 1116; SSE-NEXT: ret{{[l|q]}} 1117; 1118; X64-AVX-LABEL: mul_v8i16_7: 1119; X64-AVX: # %bb.0: 1120; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1 1121; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 1122; X64-AVX-NEXT: retq 1123 %1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 1124 ret <8 x i16> %1 1125} 1126 1127define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind { 1128; X86-SSE-LABEL: mul_v16i8_31: 1129; X86-SSE: # %bb.0: 1130; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1131; X86-SSE-NEXT: psllw $5, %xmm1 1132; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1133; X86-SSE-NEXT: psubb %xmm0, %xmm1 1134; X86-SSE-NEXT: movdqa %xmm1, %xmm0 1135; X86-SSE-NEXT: retl 1136; 1137; X64-SSE-LABEL: mul_v16i8_31: 1138; X64-SSE: # %bb.0: 1139; X64-SSE-NEXT: movdqa %xmm0, %xmm1 1140; X64-SSE-NEXT: psllw $5, %xmm1 1141; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1142; X64-SSE-NEXT: psubb %xmm0, %xmm1 1143; X64-SSE-NEXT: movdqa %xmm1, %xmm0 1144; X64-SSE-NEXT: retq 1145; 1146; X64-XOP-LABEL: mul_v16i8_31: 1147; X64-XOP: # %bb.0: 1148; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1149; X64-XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1150; X64-XOP-NEXT: retq 1151; 1152; X64-AVX2-LABEL: mul_v16i8_31: 1153; X64-AVX2: # %bb.0: 1154; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm1 1155; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1156; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1157; X64-AVX2-NEXT: retq 1158; 1159; X64-AVX512DQ-LABEL: mul_v16i8_31: 1160; X64-AVX512DQ: # %bb.0: 1161; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm1 1162; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 1163; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1164; X64-AVX512DQ-NEXT: retq 1165 %1 = mul <16 x i8> %a0, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31> 1166 ret <16 x i8> %1 1167} 1168 1169; 1170; -(PowOf2 - 1) (uniform) 1171; 1172 1173define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind { 1174; SSE-LABEL: mul_v2i64_neg7: 1175; SSE: # %bb.0: 1176; SSE-NEXT: movdqa %xmm0, %xmm1 1177; SSE-NEXT: psllq $3, %xmm1 1178; SSE-NEXT: psubq %xmm1, %xmm0 1179; SSE-NEXT: ret{{[l|q]}} 1180; 1181; X64-AVX-LABEL: mul_v2i64_neg7: 1182; X64-AVX: # %bb.0: 1183; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1 1184; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1185; X64-AVX-NEXT: retq 1186 %1 = mul <2 x i64> %a0, <i64 -7, i64 -7> 1187 ret <2 x i64> %1 1188} 1189 1190define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind { 1191; SSE-LABEL: mul_v4i32_neg63: 1192; SSE: # %bb.0: 1193; SSE-NEXT: movdqa %xmm0, %xmm1 1194; SSE-NEXT: pslld $6, %xmm1 1195; SSE-NEXT: psubd %xmm1, %xmm0 1196; SSE-NEXT: ret{{[l|q]}} 1197; 1198; X64-AVX-LABEL: mul_v4i32_neg63: 1199; X64-AVX: # %bb.0: 1200; X64-AVX-NEXT: vpslld $6, %xmm0, %xmm1 1201; X64-AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1202; X64-AVX-NEXT: retq 1203 %1 = mul <4 x i32> %a0, <i32 -63, i32 -63, i32 -63, i32 -63> 1204 ret <4 x i32> %1 1205} 1206 1207define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind { 1208; SSE-LABEL: mul_v8i16_neg31: 1209; SSE: # %bb.0: 1210; SSE-NEXT: movdqa %xmm0, %xmm1 1211; SSE-NEXT: psllw $5, %xmm1 1212; SSE-NEXT: psubw %xmm1, %xmm0 1213; SSE-NEXT: ret{{[l|q]}} 1214; 1215; X64-AVX-LABEL: mul_v8i16_neg31: 1216; X64-AVX: # %bb.0: 1217; X64-AVX-NEXT: vpsllw $5, %xmm0, %xmm1 1218; X64-AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1219; X64-AVX-NEXT: retq 1220 %1 = mul <8 x i16> %a0, <i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31> 1221 ret <8 x i16> %1 1222} 1223 1224define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind { 1225; X86-SSE-LABEL: mul_v16i8_neg15: 1226; X86-SSE: # %bb.0: 1227; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1228; X86-SSE-NEXT: psllw $4, %xmm1 1229; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1230; X86-SSE-NEXT: psubb %xmm1, %xmm0 1231; X86-SSE-NEXT: retl 1232; 1233; X64-SSE-LABEL: mul_v16i8_neg15: 1234; X64-SSE: # %bb.0: 1235; X64-SSE-NEXT: movdqa %xmm0, %xmm1 1236; X64-SSE-NEXT: psllw $4, %xmm1 1237; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1238; X64-SSE-NEXT: psubb %xmm1, %xmm0 1239; X64-SSE-NEXT: retq 1240; 1241; X64-XOP-LABEL: mul_v16i8_neg15: 1242; X64-XOP: # %bb.0: 1243; X64-XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1244; X64-XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1245; X64-XOP-NEXT: retq 1246; 1247; X64-AVX2-LABEL: mul_v16i8_neg15: 1248; X64-AVX2: # %bb.0: 1249; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1 1250; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1251; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1252; X64-AVX2-NEXT: retq 1253; 1254; X64-AVX512DQ-LABEL: mul_v16i8_neg15: 1255; X64-AVX512DQ: # %bb.0: 1256; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1 1257; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 1258; X64-AVX512DQ-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1259; X64-AVX512DQ-NEXT: retq 1260 %1 = mul <16 x i8> %a0, <i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15> 1261 ret <16 x i8> %1 1262} 1263 1264; 1265; PowOf2 - 1 (non-uniform) 1266; 1267 1268define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { 1269; X86-SSE2-LABEL: mul_v2i64_15_63: 1270; X86-SSE2: # %bb.0: 1271; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0] 1272; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1273; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 1274; X86-SSE2-NEXT: psrlq $32, %xmm0 1275; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1276; X86-SSE2-NEXT: psllq $32, %xmm0 1277; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1278; X86-SSE2-NEXT: retl 1279; 1280; SSE4-LABEL: mul_v2i64_15_63: 1281; SSE4: # %bb.0: 1282; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,63] 1283; SSE4-NEXT: movdqa %xmm0, %xmm2 1284; SSE4-NEXT: pmuludq %xmm1, %xmm2 1285; SSE4-NEXT: psrlq $32, %xmm0 1286; SSE4-NEXT: pmuludq %xmm1, %xmm0 1287; SSE4-NEXT: psllq $32, %xmm0 1288; SSE4-NEXT: paddq %xmm2, %xmm0 1289; SSE4-NEXT: ret{{[l|q]}} 1290; 1291; X64-SSE2-LABEL: mul_v2i64_15_63: 1292; X64-SSE2: # %bb.0: 1293; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,63] 1294; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1295; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1296; X64-SSE2-NEXT: psrlq $32, %xmm0 1297; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 1298; X64-SSE2-NEXT: psllq $32, %xmm0 1299; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1300; X64-SSE2-NEXT: retq 1301; 1302; X64-XOP-LABEL: mul_v2i64_15_63: 1303; X64-XOP: # %bb.0: 1304; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,63] 1305; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1306; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 1307; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1308; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1309; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1310; X64-XOP-NEXT: retq 1311; 1312; X64-AVX2-LABEL: mul_v2i64_15_63: 1313; X64-AVX2: # %bb.0: 1314; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,63] 1315; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1316; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 1317; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1318; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1319; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1320; X64-AVX2-NEXT: retq 1321; 1322; X64-AVX512DQ-LABEL: mul_v2i64_15_63: 1323; X64-AVX512DQ: # %bb.0: 1324; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1325; X64-AVX512DQ-NEXT: retq 1326 %1 = mul <2 x i64> %a0, <i64 15, i64 63> 1327 ret <2 x i64> %1 1328} 1329 1330define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { 1331; X86-SSE2-LABEL: mul_v2i64_neg_15_63: 1332; X86-SSE2: # %bb.0: 1333; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1334; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 1335; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1336; X86-SSE2-NEXT: psrlq $32, %xmm2 1337; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] 1338; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 1339; X86-SSE2-NEXT: paddq %xmm1, %xmm2 1340; X86-SSE2-NEXT: psllq $32, %xmm2 1341; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 1342; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1343; X86-SSE2-NEXT: retl 1344; 1345; X86-SSE4-LABEL: mul_v2i64_neg_15_63: 1346; X86-SSE4: # %bb.0: 1347; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 1348; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 1349; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 1350; X86-SSE4-NEXT: psrlq $32, %xmm2 1351; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553] 1352; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 1353; X86-SSE4-NEXT: paddq %xmm1, %xmm2 1354; X86-SSE4-NEXT: psllq $32, %xmm2 1355; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 1356; X86-SSE4-NEXT: paddq %xmm2, %xmm0 1357; X86-SSE4-NEXT: retl 1358; 1359; X64-SSE2-LABEL: mul_v2i64_neg_15_63: 1360; X64-SSE2: # %bb.0: 1361; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] 1362; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1363; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1364; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 1365; X64-SSE2-NEXT: psrlq $32, %xmm3 1366; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 1367; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1368; X64-SSE2-NEXT: paddq %xmm3, %xmm0 1369; X64-SSE2-NEXT: psllq $32, %xmm0 1370; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1371; X64-SSE2-NEXT: retq 1372; 1373; X64-SSE4-LABEL: mul_v2i64_neg_15_63: 1374; X64-SSE4: # %bb.0: 1375; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] 1376; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 1377; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 1378; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 1379; X64-SSE4-NEXT: psrlq $32, %xmm3 1380; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 1381; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1382; X64-SSE4-NEXT: paddq %xmm3, %xmm0 1383; X64-SSE4-NEXT: psllq $32, %xmm0 1384; X64-SSE4-NEXT: paddq %xmm2, %xmm0 1385; X64-SSE4-NEXT: retq 1386; 1387; X64-XOP-LABEL: mul_v2i64_neg_15_63: 1388; X64-XOP: # %bb.0: 1389; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] 1390; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1391; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 1392; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1393; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1394; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1395; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1396; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1397; X64-XOP-NEXT: retq 1398; 1399; X64-AVX2-LABEL: mul_v2i64_neg_15_63: 1400; X64-AVX2: # %bb.0: 1401; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] 1402; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1403; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 1404; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1405; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1406; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1407; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1408; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1409; X64-AVX2-NEXT: retq 1410; 1411; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63: 1412; X64-AVX512DQ: # %bb.0: 1413; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1414; X64-AVX512DQ-NEXT: retq 1415 %1 = mul <2 x i64> %a0, <i64 -15, i64 -63> 1416 ret <2 x i64> %1 1417} 1418 1419define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { 1420; X86-SSE2-LABEL: mul_v2i64_neg_17_65: 1421; X86-SSE2: # %bb.0: 1422; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 1423; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 1424; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1425; X86-SSE2-NEXT: psrlq $32, %xmm2 1426; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] 1427; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 1428; X86-SSE2-NEXT: paddq %xmm1, %xmm2 1429; X86-SSE2-NEXT: psllq $32, %xmm2 1430; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 1431; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1432; X86-SSE2-NEXT: retl 1433; 1434; X86-SSE4-LABEL: mul_v2i64_neg_17_65: 1435; X86-SSE4: # %bb.0: 1436; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 1437; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 1438; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 1439; X86-SSE4-NEXT: psrlq $32, %xmm2 1440; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551] 1441; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 1442; X86-SSE4-NEXT: paddq %xmm1, %xmm2 1443; X86-SSE4-NEXT: psllq $32, %xmm2 1444; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 1445; X86-SSE4-NEXT: paddq %xmm2, %xmm0 1446; X86-SSE4-NEXT: retl 1447; 1448; X64-SSE2-LABEL: mul_v2i64_neg_17_65: 1449; X64-SSE2: # %bb.0: 1450; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] 1451; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1452; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1453; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 1454; X64-SSE2-NEXT: psrlq $32, %xmm3 1455; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 1456; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1457; X64-SSE2-NEXT: paddq %xmm3, %xmm0 1458; X64-SSE2-NEXT: psllq $32, %xmm0 1459; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1460; X64-SSE2-NEXT: retq 1461; 1462; X64-SSE4-LABEL: mul_v2i64_neg_17_65: 1463; X64-SSE4: # %bb.0: 1464; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] 1465; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 1466; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 1467; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 1468; X64-SSE4-NEXT: psrlq $32, %xmm3 1469; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 1470; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1471; X64-SSE4-NEXT: paddq %xmm3, %xmm0 1472; X64-SSE4-NEXT: psllq $32, %xmm0 1473; X64-SSE4-NEXT: paddq %xmm2, %xmm0 1474; X64-SSE4-NEXT: retq 1475; 1476; X64-XOP-LABEL: mul_v2i64_neg_17_65: 1477; X64-XOP: # %bb.0: 1478; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] 1479; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1480; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 1481; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1482; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1483; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1484; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1485; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1486; X64-XOP-NEXT: retq 1487; 1488; X64-AVX2-LABEL: mul_v2i64_neg_17_65: 1489; X64-AVX2: # %bb.0: 1490; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] 1491; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1492; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 1493; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1494; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1495; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1496; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1497; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1498; X64-AVX2-NEXT: retq 1499; 1500; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65: 1501; X64-AVX512DQ: # %bb.0: 1502; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1503; X64-AVX512DQ-NEXT: retq 1504 %1 = mul <2 x i64> %a0, <i64 -17, i64 -65> 1505 ret <2 x i64> %1 1506} 1507 1508define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { 1509; X86-SSE2-LABEL: mul_v2i64_0_1: 1510; X86-SSE2: # %bb.0: 1511; X86-SSE2-NEXT: xorpd %xmm1, %xmm1 1512; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1513; X86-SSE2-NEXT: retl 1514; 1515; SSE4-LABEL: mul_v2i64_0_1: 1516; SSE4: # %bb.0: 1517; SSE4-NEXT: xorps %xmm1, %xmm1 1518; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1519; SSE4-NEXT: ret{{[l|q]}} 1520; 1521; X64-SSE2-LABEL: mul_v2i64_0_1: 1522; X64-SSE2: # %bb.0: 1523; X64-SSE2-NEXT: xorps %xmm1, %xmm1 1524; X64-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1525; X64-SSE2-NEXT: movaps %xmm1, %xmm0 1526; X64-SSE2-NEXT: retq 1527; 1528; X64-AVX-LABEL: mul_v2i64_0_1: 1529; X64-AVX: # %bb.0: 1530; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1531; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1532; X64-AVX-NEXT: retq 1533 %1 = mul <2 x i64> %a0, <i64 0, i64 1> 1534 ret <2 x i64> %1 1535} 1536 1537define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { 1538; X86-SSE2-LABEL: mul_v2i64_neg_0_1: 1539; X86-SSE2: # %bb.0: 1540; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295] 1541; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1542; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 1543; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1544; X86-SSE2-NEXT: psrlq $32, %xmm3 1545; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 1546; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1547; X86-SSE2-NEXT: paddq %xmm3, %xmm0 1548; X86-SSE2-NEXT: psllq $32, %xmm0 1549; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1550; X86-SSE2-NEXT: retl 1551; 1552; X86-SSE4-LABEL: mul_v2i64_neg_0_1: 1553; X86-SSE4: # %bb.0: 1554; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] 1555; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 1556; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 1557; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 1558; X86-SSE4-NEXT: psrlq $32, %xmm3 1559; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 1560; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1561; X86-SSE4-NEXT: paddq %xmm3, %xmm0 1562; X86-SSE4-NEXT: psllq $32, %xmm0 1563; X86-SSE4-NEXT: paddq %xmm2, %xmm0 1564; X86-SSE4-NEXT: retl 1565; 1566; X64-SSE2-LABEL: mul_v2i64_neg_0_1: 1567; X64-SSE2: # %bb.0: 1568; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] 1569; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1570; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1571; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 1572; X64-SSE2-NEXT: psrlq $32, %xmm3 1573; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 1574; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1575; X64-SSE2-NEXT: paddq %xmm3, %xmm0 1576; X64-SSE2-NEXT: psllq $32, %xmm0 1577; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1578; X64-SSE2-NEXT: retq 1579; 1580; X64-SSE4-LABEL: mul_v2i64_neg_0_1: 1581; X64-SSE4: # %bb.0: 1582; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] 1583; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 1584; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 1585; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 1586; X64-SSE4-NEXT: psrlq $32, %xmm3 1587; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 1588; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1589; X64-SSE4-NEXT: paddq %xmm3, %xmm0 1590; X64-SSE4-NEXT: psllq $32, %xmm0 1591; X64-SSE4-NEXT: paddq %xmm2, %xmm0 1592; X64-SSE4-NEXT: retq 1593; 1594; X64-XOP-LABEL: mul_v2i64_neg_0_1: 1595; X64-XOP: # %bb.0: 1596; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] 1597; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1598; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 1599; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1600; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1601; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1602; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1603; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1604; X64-XOP-NEXT: retq 1605; 1606; X64-AVX2-LABEL: mul_v2i64_neg_0_1: 1607; X64-AVX2: # %bb.0: 1608; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] 1609; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1610; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 1611; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1612; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1613; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1614; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1615; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1616; X64-AVX2-NEXT: retq 1617; 1618; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1: 1619; X64-AVX512DQ: # %bb.0: 1620; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1621; X64-AVX512DQ-NEXT: retq 1622 %1 = mul <2 x i64> %a0, <i64 0, i64 -1> 1623 ret <2 x i64> %1 1624} 1625 1626define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { 1627; X86-SSE2-LABEL: mul_v2i64_15_neg_63: 1628; X86-SSE2: # %bb.0: 1629; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295] 1630; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1631; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 1632; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1633; X86-SSE2-NEXT: psrlq $32, %xmm3 1634; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 1635; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1636; X86-SSE2-NEXT: paddq %xmm3, %xmm0 1637; X86-SSE2-NEXT: psllq $32, %xmm0 1638; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1639; X86-SSE2-NEXT: retl 1640; 1641; X86-SSE4-LABEL: mul_v2i64_15_neg_63: 1642; X86-SSE4: # %bb.0: 1643; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] 1644; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 1645; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 1646; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 1647; X86-SSE4-NEXT: psrlq $32, %xmm3 1648; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 1649; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1650; X86-SSE4-NEXT: paddq %xmm3, %xmm0 1651; X86-SSE4-NEXT: psllq $32, %xmm0 1652; X86-SSE4-NEXT: paddq %xmm2, %xmm0 1653; X86-SSE4-NEXT: retl 1654; 1655; X64-SSE2-LABEL: mul_v2i64_15_neg_63: 1656; X64-SSE2: # %bb.0: 1657; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553] 1658; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1659; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1660; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 1661; X64-SSE2-NEXT: psrlq $32, %xmm3 1662; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 1663; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1664; X64-SSE2-NEXT: paddq %xmm3, %xmm0 1665; X64-SSE2-NEXT: psllq $32, %xmm0 1666; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1667; X64-SSE2-NEXT: retq 1668; 1669; X64-SSE4-LABEL: mul_v2i64_15_neg_63: 1670; X64-SSE4: # %bb.0: 1671; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] 1672; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 1673; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 1674; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 1675; X64-SSE4-NEXT: psrlq $32, %xmm3 1676; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 1677; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1678; X64-SSE4-NEXT: paddq %xmm3, %xmm0 1679; X64-SSE4-NEXT: psllq $32, %xmm0 1680; X64-SSE4-NEXT: paddq %xmm2, %xmm0 1681; X64-SSE4-NEXT: retq 1682; 1683; X64-XOP-LABEL: mul_v2i64_15_neg_63: 1684; X64-XOP: # %bb.0: 1685; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] 1686; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1687; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 1688; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1689; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1690; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1691; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1692; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1693; X64-XOP-NEXT: retq 1694; 1695; X64-AVX2-LABEL: mul_v2i64_15_neg_63: 1696; X64-AVX2: # %bb.0: 1697; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] 1698; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1699; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 1700; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 1701; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1702; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1703; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1704; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1705; X64-AVX2-NEXT: retq 1706; 1707; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63: 1708; X64-AVX512DQ: # %bb.0: 1709; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1710; X64-AVX512DQ-NEXT: retq 1711 %1 = mul <2 x i64> %a0, <i64 15, i64 -63> 1712 ret <2 x i64> %1 1713} 1714 1715define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind { 1716; X86-SSE2-LABEL: mul_v4i32_0_15_31_7: 1717; X86-SSE2: # %bb.0: 1718; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1719; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1720; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1721; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1722; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1723; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1724; X86-SSE2-NEXT: retl 1725; 1726; X86-SSE4-LABEL: mul_v4i32_0_15_31_7: 1727; X86-SSE4: # %bb.0: 1728; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1729; X86-SSE4-NEXT: retl 1730; 1731; X64-SSE2-LABEL: mul_v4i32_0_15_31_7: 1732; X64-SSE2: # %bb.0: 1733; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1734; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1735; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1736; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1737; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1738; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1739; X64-SSE2-NEXT: retq 1740; 1741; X64-SSE4-LABEL: mul_v4i32_0_15_31_7: 1742; X64-SSE4: # %bb.0: 1743; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1744; X64-SSE4-NEXT: retq 1745; 1746; X64-AVX-LABEL: mul_v4i32_0_15_31_7: 1747; X64-AVX: # %bb.0: 1748; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1749; X64-AVX-NEXT: retq 1750 %1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7> 1751 ret <4 x i32> %1 1752} 1753 1754define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind { 1755; X86-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255: 1756; X86-SSE: # %bb.0: 1757; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,1,7,15,31,63,127,255] 1758; X86-SSE-NEXT: retl 1759; 1760; X64-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255: 1761; X64-SSE: # %bb.0: 1762; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,7,15,31,63,127,255] 1763; X64-SSE-NEXT: retq 1764; 1765; X64-AVX-LABEL: mul_v8i16_0_1_7_15_31_63_127_255: 1766; X64-AVX: # %bb.0: 1767; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,7,15,31,63,127,255] 1768; X64-AVX-NEXT: retq 1769 %1 = mul <8 x i16> %a0, <i16 0, i16 1, i16 7, i16 15, i16 31, i16 63, i16 127, i16 255> 1770 ret <8 x i16> %1 1771} 1772 1773define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind { 1774; SSE2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: 1775; SSE2: # %bb.0: 1776; SSE2-NEXT: movdqa %xmm0, %xmm1 1777; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1778; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] 1779; SSE2-NEXT: pmullw %xmm2, %xmm1 1780; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1781; SSE2-NEXT: pand %xmm3, %xmm1 1782; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1783; SSE2-NEXT: pmullw %xmm2, %xmm0 1784; SSE2-NEXT: pand %xmm3, %xmm0 1785; SSE2-NEXT: packuswb %xmm1, %xmm0 1786; SSE2-NEXT: ret{{[l|q]}} 1787; 1788; X86-SSE4-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: 1789; X86-SSE4: # %bb.0: 1790; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 1791; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127] 1792; X86-SSE4-NEXT: psllw $8, %xmm1 1793; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0] 1794; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1795; X86-SSE4-NEXT: por %xmm1, %xmm0 1796; X86-SSE4-NEXT: retl 1797; 1798; X64-SSE4-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: 1799; X64-SSE4: # %bb.0: 1800; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 1801; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127] 1802; X64-SSE4-NEXT: psllw $8, %xmm1 1803; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0] 1804; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1805; X64-SSE4-NEXT: por %xmm1, %xmm0 1806; X64-SSE4-NEXT: retq 1807; 1808; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: 1809; X64-XOP: # %bb.0: 1810; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127] 1811; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0] 1812; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14] 1813; X64-XOP-NEXT: retq 1814; 1815; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: 1816; X64-AVX2: # %bb.0: 1817; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1818; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127] 1819; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1820; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1821; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1822; X64-AVX2-NEXT: vzeroupper 1823; X64-AVX2-NEXT: retq 1824; 1825; X64-AVX512DQ-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: 1826; X64-AVX512DQ: # %bb.0: 1827; X64-AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1828; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127] 1829; X64-AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1830; X64-AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1831; X64-AVX512DQ-NEXT: vzeroupper 1832; X64-AVX512DQ-NEXT: retq 1833 %1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127> 1834 ret <16 x i8> %1 1835} 1836 1837define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { 1838; X86-SSE2-LABEL: mul_v2i64_68_132: 1839; X86-SSE2: # %bb.0: 1840; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0] 1841; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1842; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 1843; X86-SSE2-NEXT: psrlq $32, %xmm0 1844; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1845; X86-SSE2-NEXT: psllq $32, %xmm0 1846; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1847; X86-SSE2-NEXT: retl 1848; 1849; SSE4-LABEL: mul_v2i64_68_132: 1850; SSE4: # %bb.0: 1851; SSE4-NEXT: pmovzxbq {{.*#+}} xmm1 = [68,132] 1852; SSE4-NEXT: movdqa %xmm0, %xmm2 1853; SSE4-NEXT: pmuludq %xmm1, %xmm2 1854; SSE4-NEXT: psrlq $32, %xmm0 1855; SSE4-NEXT: pmuludq %xmm1, %xmm0 1856; SSE4-NEXT: psllq $32, %xmm0 1857; SSE4-NEXT: paddq %xmm2, %xmm0 1858; SSE4-NEXT: ret{{[l|q]}} 1859; 1860; X64-SSE2-LABEL: mul_v2i64_68_132: 1861; X64-SSE2: # %bb.0: 1862; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [68,132] 1863; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1864; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1865; X64-SSE2-NEXT: psrlq $32, %xmm0 1866; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 1867; X64-SSE2-NEXT: psllq $32, %xmm0 1868; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1869; X64-SSE2-NEXT: retq 1870; 1871; X64-XOP-LABEL: mul_v2i64_68_132: 1872; X64-XOP: # %bb.0: 1873; X64-XOP-NEXT: vpmovzxbq {{.*#+}} xmm1 = [68,132] 1874; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1875; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 1876; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1877; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1878; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1879; X64-XOP-NEXT: retq 1880; 1881; X64-AVX2-LABEL: mul_v2i64_68_132: 1882; X64-AVX2: # %bb.0: 1883; X64-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [68,132] 1884; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1885; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 1886; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1887; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1888; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1889; X64-AVX2-NEXT: retq 1890; 1891; X64-AVX512DQ-LABEL: mul_v2i64_68_132: 1892; X64-AVX512DQ: # %bb.0: 1893; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1894; X64-AVX512DQ-NEXT: retq 1895 %mul = mul <2 x i64> %x, <i64 68, i64 132> 1896 ret <2 x i64> %mul 1897} 1898 1899define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind { 1900; X86-SSE2-LABEL: mul_v2i64_60_120: 1901; X86-SSE2: # %bb.0: 1902; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0] 1903; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1904; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 1905; X86-SSE2-NEXT: psrlq $32, %xmm0 1906; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1907; X86-SSE2-NEXT: psllq $32, %xmm0 1908; X86-SSE2-NEXT: paddq %xmm2, %xmm0 1909; X86-SSE2-NEXT: retl 1910; 1911; SSE4-LABEL: mul_v2i64_60_120: 1912; SSE4: # %bb.0: 1913; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [60,124] 1914; SSE4-NEXT: movdqa %xmm0, %xmm2 1915; SSE4-NEXT: pmuludq %xmm1, %xmm2 1916; SSE4-NEXT: psrlq $32, %xmm0 1917; SSE4-NEXT: pmuludq %xmm1, %xmm0 1918; SSE4-NEXT: psllq $32, %xmm0 1919; SSE4-NEXT: paddq %xmm2, %xmm0 1920; SSE4-NEXT: ret{{[l|q]}} 1921; 1922; X64-SSE2-LABEL: mul_v2i64_60_120: 1923; X64-SSE2: # %bb.0: 1924; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [60,124] 1925; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 1926; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 1927; X64-SSE2-NEXT: psrlq $32, %xmm0 1928; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 1929; X64-SSE2-NEXT: psllq $32, %xmm0 1930; X64-SSE2-NEXT: paddq %xmm2, %xmm0 1931; X64-SSE2-NEXT: retq 1932; 1933; X64-XOP-LABEL: mul_v2i64_60_120: 1934; X64-XOP: # %bb.0: 1935; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,124] 1936; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1937; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 1938; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1939; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 1940; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1941; X64-XOP-NEXT: retq 1942; 1943; X64-AVX2-LABEL: mul_v2i64_60_120: 1944; X64-AVX2: # %bb.0: 1945; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,124] 1946; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1947; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 1948; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1949; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 1950; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 1951; X64-AVX2-NEXT: retq 1952; 1953; X64-AVX512DQ-LABEL: mul_v2i64_60_120: 1954; X64-AVX512DQ: # %bb.0: 1955; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1956; X64-AVX512DQ-NEXT: retq 1957 %mul = mul <2 x i64> %x, <i64 60, i64 124> 1958 ret <2 x i64> %mul 1959} 1960 1961; We unfortunately can't see the zext that lives in the other basic block so we 1962; don't know that we only need one pmuludq to compute the full 64 bits. This 1963; sort of issue is more likely to occur when there is a loop and one of the 1964; multiply inputs is loop invariant. 1965define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) { 1966; X86-SSE2-LABEL: mul_v2i64_zext_cross_bb: 1967; X86-SSE2: # %bb.0: 1968; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax 1969; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx 1970; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1971; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1972; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1973; X86-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 1974; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1] 1975; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1976; X86-SSE2-NEXT: retl 1977; 1978; X86-SSE4-LABEL: mul_v2i64_zext_cross_bb: 1979; X86-SSE4: # %bb.0: 1980; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax 1981; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx 1982; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 1983; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 1984; X86-SSE4-NEXT: pmuludq %xmm1, %xmm0 1985; X86-SSE4-NEXT: retl 1986; 1987; X64-SSE2-LABEL: mul_v2i64_zext_cross_bb: 1988; X64-SSE2: # %bb.0: 1989; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1990; X64-SSE2-NEXT: pxor %xmm1, %xmm1 1991; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1992; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 1993; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 1994; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 1995; X64-SSE2-NEXT: retq 1996; 1997; X64-SSE4-LABEL: mul_v2i64_zext_cross_bb: 1998; X64-SSE4: # %bb.0: 1999; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2000; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 2001; X64-SSE4-NEXT: pmuludq %xmm1, %xmm0 2002; X64-SSE4-NEXT: retq 2003; 2004; X64-AVX-LABEL: mul_v2i64_zext_cross_bb: 2005; X64-AVX: # %bb.0: 2006; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2007; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 2008; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 2009; X64-AVX-NEXT: retq 2010 %a = load <2 x i32>, ptr %in 2011 %b = zext <2 x i32> %a to <2 x i64> 2012 br label %foo 2013 2014foo: 2015 %c = load <2 x i32>, ptr %y 2016 %d = zext <2 x i32> %c to <2 x i64> 2017 %e = mul <2 x i64> %b, %d 2018 ret <2 x i64> %e 2019} 2020 2021define <4 x i64> @mul_v4i64_zext_cross_bb(ptr %in, ptr %y) { 2022; X86-SSE2-LABEL: mul_v4i64_zext_cross_bb: 2023; X86-SSE2: # %bb.0: 2024; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax 2025; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx 2026; X86-SSE2-NEXT: movdqa (%ecx), %xmm0 2027; X86-SSE2-NEXT: pxor %xmm2, %xmm2 2028; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 2029; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 2030; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2031; X86-SSE2-NEXT: movdqa (%eax), %xmm2 2032; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3] 2033; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 2034; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 2035; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 2036; X86-SSE2-NEXT: retl 2037; 2038; X86-SSE4-LABEL: mul_v4i64_zext_cross_bb: 2039; X86-SSE4: # %bb.0: 2040; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax 2041; X86-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx 2042; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 2043; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2044; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero 2045; X86-SSE4-NEXT: pmuludq %xmm2, %xmm1 2046; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero 2047; X86-SSE4-NEXT: pmuludq %xmm2, %xmm0 2048; X86-SSE4-NEXT: retl 2049; 2050; X64-SSE2-LABEL: mul_v4i64_zext_cross_bb: 2051; X64-SSE2: # %bb.0: 2052; X64-SSE2-NEXT: movdqa (%rdi), %xmm0 2053; X64-SSE2-NEXT: pxor %xmm2, %xmm2 2054; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 2055; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 2056; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2057; X64-SSE2-NEXT: movdqa (%rsi), %xmm2 2058; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3] 2059; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 2060; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 2061; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 2062; X64-SSE2-NEXT: retq 2063; 2064; X64-SSE4-LABEL: mul_v4i64_zext_cross_bb: 2065; X64-SSE4: # %bb.0: 2066; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 2067; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2068; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero 2069; X64-SSE4-NEXT: pmuludq %xmm2, %xmm1 2070; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero 2071; X64-SSE4-NEXT: pmuludq %xmm2, %xmm0 2072; X64-SSE4-NEXT: retq 2073; 2074; X64-XOP-LABEL: mul_v4i64_zext_cross_bb: 2075; X64-XOP: # %bb.0: 2076; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2077; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 2078; X64-XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2079; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero 2080; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero 2081; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 2082; X64-XOP-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2083; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 2084; X64-XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2085; X64-XOP-NEXT: retq 2086; 2087; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb: 2088; X64-AVX2: # %bb.0: 2089; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2090; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2091; X64-AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 2092; X64-AVX2-NEXT: retq 2093; 2094; X64-AVX512DQ-LABEL: mul_v4i64_zext_cross_bb: 2095; X64-AVX512DQ: # %bb.0: 2096; X64-AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2097; X64-AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2098; X64-AVX512DQ-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 2099; X64-AVX512DQ-NEXT: retq 2100 %a = load <4 x i32>, ptr %in 2101 %b = zext <4 x i32> %a to <4 x i64> 2102 br label %foo 2103 2104foo: 2105 %c = load <4 x i32>, ptr %y 2106 %d = zext <4 x i32> %c to <4 x i64> 2107 %e = mul <4 x i64> %b, %d 2108 ret <4 x i64> %e 2109} 2110;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 2111; X64-SSE4-FAST: {{.*}} 2112; X64-SSE4-SLOW: {{.*}} 2113