; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

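; Multiplication by the uniform constant 117, for each 128-bit element type.
; x86 has no byte-element multiply instruction, so the v16i8 case is widened to
; i16 (punpck + pmullw + pack on SSE2, pmaddubsw on SSE4.1 and later).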
define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2-LABEL: mul_v16i8c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
; SSE-LABEL: mul_v8i16c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117,117,117,117,117]
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [117,117,117,117,117,117,117,117]
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
; SSE2-LABEL: mul_v4i32c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

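; None of these subtargets have a vector i64 multiply, so v2i64 is decomposed
; into pmuludq of the 32-bit halves plus shifts and adds.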
define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE2-LABEL: mul_v2i64c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v2i64c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: psllq $32, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v2i64c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-LABEL: mul_v16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pmaddubsw %xmm3, %xmm4
; SSE41-NEXT: pand %xmm2, %xmm4
; SSE41-NEXT: pandn %xmm1, %xmm2
; SSE41-NEXT: pmaddubsw %xmm2, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
; SSE-LABEL: mul_v8i16:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

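; v4i32 multiply needs pmuludq plus shuffles on SSE2, but becomes a single
; pmulld once SSE4.1 is available.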
define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

declare void @foo()

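; A call forces both operands to be spilled; SSE4.1 and AVX fold one of the
; reloads directly into the multiply.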
define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32spill:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq foo@PLT
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32spill:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE41-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT: callq foo@PLT
; SSE41-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT: pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT: addq $40, %rsp
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32spill:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64spill:
; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: callq foo@PLT
; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64spill:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrlq $32, %xmm3, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

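; 256-bit versions of the same constant multiplies: SSE targets split them into
; two 128-bit halves, while AVX2 and AVX-512 keep them as a single 256-bit (or
; widened 512-bit) operation.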
define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2-LABEL: mul_v32i8c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 =
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 441; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] 442; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 443; AVX512BW-NEXT: retq 444entry: 445 %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 > 446 ret <32 x i8> %A 447} 448 449define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind { 450; SSE2-LABEL: mul_v16i16c: 451; SSE2: # %bb.0: # %entry 452; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] 453; SSE2-NEXT: pmullw %xmm2, %xmm0 454; SSE2-NEXT: pmullw %xmm2, %xmm1 455; SSE2-NEXT: retq 456; 457; SSE41-LABEL: mul_v16i16c: 458; SSE41: # %bb.0: # %entry 459; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] 460; SSE41-NEXT: pmullw %xmm2, %xmm0 461; SSE41-NEXT: pmullw %xmm2, %xmm1 462; SSE41-NEXT: retq 463; 464; AVX-LABEL: mul_v16i16c: 465; AVX: # %bb.0: # %entry 466; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] 467; AVX-NEXT: retq 468entry: 469 %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 > 470 ret <16 x i16> %A 471} 472 473define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { 474; SSE2-LABEL: mul_v8i32c: 475; SSE2: # %bb.0: # %entry 476; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117] 477; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 478; SSE2-NEXT: pmuludq %xmm2, %xmm0 479; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 480; SSE2-NEXT: pmuludq %xmm2, %xmm3 481; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 482; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 483; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 484; SSE2-NEXT: pmuludq %xmm2, %xmm1 485; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 486; SSE2-NEXT: pmuludq %xmm2, %xmm3 487; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 488; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 489; SSE2-NEXT: retq 490; 491; SSE41-LABEL: mul_v8i32c: 492; SSE41: # %bb.0: # %entry 493; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [117,117,117,117] 494; SSE41-NEXT: pmulld %xmm2, %xmm0 495; SSE41-NEXT: pmulld %xmm2, %xmm1 496; SSE41-NEXT: retq 497; 498; AVX-LABEL: mul_v8i32c: 499; AVX: # %bb.0: # %entry 500; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] 501; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 502; AVX-NEXT: retq 503entry: 504 %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 > 505 ret <8 x i32> %A 506} 507 508define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { 509; SSE2-LABEL: mul_v4i64c: 510; SSE2: # %bb.0: # %entry 511; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117] 512; SSE2-NEXT: movdqa 
%xmm0, %xmm3 513; SSE2-NEXT: pmuludq %xmm2, %xmm3 514; SSE2-NEXT: psrlq $32, %xmm0 515; SSE2-NEXT: pmuludq %xmm2, %xmm0 516; SSE2-NEXT: psllq $32, %xmm0 517; SSE2-NEXT: paddq %xmm3, %xmm0 518; SSE2-NEXT: movdqa %xmm1, %xmm3 519; SSE2-NEXT: pmuludq %xmm2, %xmm3 520; SSE2-NEXT: psrlq $32, %xmm1 521; SSE2-NEXT: pmuludq %xmm2, %xmm1 522; SSE2-NEXT: psllq $32, %xmm1 523; SSE2-NEXT: paddq %xmm3, %xmm1 524; SSE2-NEXT: retq 525; 526; SSE41-LABEL: mul_v4i64c: 527; SSE41: # %bb.0: # %entry 528; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [117,117] 529; SSE41-NEXT: movdqa %xmm0, %xmm3 530; SSE41-NEXT: pmuludq %xmm2, %xmm3 531; SSE41-NEXT: psrlq $32, %xmm0 532; SSE41-NEXT: pmuludq %xmm2, %xmm0 533; SSE41-NEXT: psllq $32, %xmm0 534; SSE41-NEXT: paddq %xmm3, %xmm0 535; SSE41-NEXT: movdqa %xmm1, %xmm3 536; SSE41-NEXT: pmuludq %xmm2, %xmm3 537; SSE41-NEXT: psrlq $32, %xmm1 538; SSE41-NEXT: pmuludq %xmm2, %xmm1 539; SSE41-NEXT: psllq $32, %xmm1 540; SSE41-NEXT: paddq %xmm3, %xmm1 541; SSE41-NEXT: retq 542; 543; AVX-LABEL: mul_v4i64c: 544; AVX: # %bb.0: # %entry 545; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117] 546; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 547; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 548; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 549; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 550; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 551; AVX-NEXT: retq 552entry: 553 %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 > 554 ret <4 x i64> %A 555} 556 557define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { 558; SSE2-LABEL: mul_v32i8: 559; SSE2: # %bb.0: # %entry 560; SSE2-NEXT: movdqa %xmm2, %xmm4 561; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 562; SSE2-NEXT: movdqa %xmm0, %xmm5 563; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 564; SSE2-NEXT: pmullw %xmm4, %xmm5 565; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 566; SSE2-NEXT: pand %xmm4, %xmm5 567; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 568; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 569; SSE2-NEXT: pmullw %xmm2, %xmm0 570; SSE2-NEXT: pand %xmm4, %xmm0 571; SSE2-NEXT: packuswb %xmm5, %xmm0 572; SSE2-NEXT: movdqa %xmm3, %xmm2 573; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 574; SSE2-NEXT: movdqa %xmm1, %xmm5 575; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 576; SSE2-NEXT: pmullw %xmm2, %xmm5 577; SSE2-NEXT: pand %xmm4, %xmm5 578; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 579; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 580; SSE2-NEXT: pmullw %xmm3, %xmm1 581; SSE2-NEXT: pand %xmm4, %xmm1 582; SSE2-NEXT: packuswb %xmm5, %xmm1 583; SSE2-NEXT: retq 584; 585; SSE41-LABEL: mul_v32i8: 586; SSE41: # %bb.0: # %entry 587; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 588; SSE41-NEXT: movdqa %xmm4, %xmm5 589; SSE41-NEXT: pandn %xmm2, %xmm5 590; SSE41-NEXT: pand %xmm4, %xmm2 591; SSE41-NEXT: movdqa %xmm0, %xmm6 592; SSE41-NEXT: pmaddubsw %xmm2, %xmm6 593; SSE41-NEXT: pand %xmm4, %xmm6 594; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 595; SSE41-NEXT: psllw $8, %xmm0 596; SSE41-NEXT: por %xmm6, %xmm0 597; SSE41-NEXT: movdqa %xmm3, %xmm2 598; SSE41-NEXT: pand %xmm4, %xmm2 599; SSE41-NEXT: movdqa %xmm1, %xmm5 600; SSE41-NEXT: pmaddubsw %xmm2, %xmm5 601; SSE41-NEXT: pand %xmm4, %xmm5 602; SSE41-NEXT: pandn %xmm3, %xmm4 603; 
SSE41-NEXT: pmaddubsw %xmm4, %xmm1 604; SSE41-NEXT: psllw $8, %xmm1 605; SSE41-NEXT: por %xmm5, %xmm1 606; SSE41-NEXT: retq 607; 608; AVX2-LABEL: mul_v32i8: 609; AVX2: # %bb.0: # %entry 610; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 611; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 612; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 613; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 614; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 615; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 616; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 617; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 618; AVX2-NEXT: retq 619; 620; AVX512F-LABEL: mul_v32i8: 621; AVX512F: # %bb.0: # %entry 622; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 623; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 624; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 625; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 626; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 627; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 628; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 629; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 630; AVX512F-NEXT: retq 631; 632; AVX512BW-LABEL: mul_v32i8: 633; AVX512BW: # %bb.0: # %entry 634; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 635; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 636; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 637; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 638; AVX512BW-NEXT: retq 639entry: 640 %A = mul <32 x i8> %i, %j 641 ret <32 x i8> %A 642} 643 644define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind { 645; SSE-LABEL: mul_v16i16: 646; SSE: # %bb.0: # %entry 647; SSE-NEXT: pmullw %xmm2, %xmm0 648; SSE-NEXT: pmullw %xmm3, %xmm1 649; SSE-NEXT: retq 650; 651; AVX-LABEL: mul_v16i16: 652; AVX: # %bb.0: # %entry 653; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0 654; AVX-NEXT: retq 655entry: 656 %A = mul <16 x i16> %i, %j 657 ret <16 x i16> %A 658} 659 660define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind { 661; SSE2-LABEL: mul_v8i32: 662; SSE2: # %bb.0: # %entry 663; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 664; SSE2-NEXT: pmuludq %xmm2, %xmm0 665; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 666; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 667; SSE2-NEXT: pmuludq %xmm4, %xmm2 668; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 669; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 670; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 671; SSE2-NEXT: pmuludq %xmm3, %xmm1 672; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 673; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 674; SSE2-NEXT: pmuludq %xmm2, %xmm3 675; SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[0,2,2,3] 676; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 677; SSE2-NEXT: retq 678; 679; SSE41-LABEL: mul_v8i32: 680; SSE41: # %bb.0: # %entry 681; SSE41-NEXT: pmulld %xmm2, %xmm0 682; SSE41-NEXT: pmulld %xmm3, %xmm1 683; SSE41-NEXT: retq 684; 685; AVX-LABEL: mul_v8i32: 686; AVX: # %bb.0: # %entry 687; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 688; AVX-NEXT: retq 689entry: 690 %A = mul <8 x i32> %i, %j 691 ret <8 x i32> %A 692} 693 694define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind { 695; SSE-LABEL: mul_v4i64: 696; SSE: # %bb.0: # %entry 697; SSE-NEXT: movdqa %xmm0, %xmm4 698; SSE-NEXT: psrlq $32, %xmm4 699; SSE-NEXT: pmuludq %xmm2, %xmm4 700; SSE-NEXT: movdqa %xmm2, %xmm5 701; SSE-NEXT: psrlq $32, %xmm5 702; SSE-NEXT: pmuludq %xmm0, %xmm5 703; SSE-NEXT: paddq %xmm4, %xmm5 704; SSE-NEXT: psllq $32, %xmm5 705; SSE-NEXT: pmuludq %xmm2, %xmm0 706; SSE-NEXT: paddq %xmm5, %xmm0 707; SSE-NEXT: movdqa %xmm1, %xmm2 708; SSE-NEXT: psrlq $32, %xmm2 709; SSE-NEXT: pmuludq %xmm3, %xmm2 710; SSE-NEXT: movdqa %xmm3, %xmm4 711; SSE-NEXT: psrlq $32, %xmm4 712; SSE-NEXT: pmuludq %xmm1, %xmm4 713; SSE-NEXT: paddq %xmm2, %xmm4 714; SSE-NEXT: psllq $32, %xmm4 715; SSE-NEXT: pmuludq %xmm3, %xmm1 716; SSE-NEXT: paddq %xmm4, %xmm1 717; SSE-NEXT: retq 718; 719; AVX-LABEL: mul_v4i64: 720; AVX: # %bb.0: # %entry 721; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2 722; AVX-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 723; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3 724; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 725; AVX-NEXT: vpaddq %ymm2, %ymm3, %ymm2 726; AVX-NEXT: vpsllq $32, %ymm2, %ymm2 727; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 728; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 729; AVX-NEXT: retq 730entry: 731 %A = mul <4 x i64> %i, %j 732 ret <4 x i64> %A 733} 734 735define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { 736; SSE2-LABEL: mul_v64i8c: 737; SSE2: # %bb.0: # %entry 738; SSE2-NEXT: movdqa %xmm0, %xmm6 739; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 740; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] 741; SSE2-NEXT: pmullw %xmm4, %xmm6 742; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 743; SSE2-NEXT: pand %xmm5, %xmm6 744; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 745; SSE2-NEXT: pmullw %xmm4, %xmm0 746; SSE2-NEXT: pand %xmm5, %xmm0 747; SSE2-NEXT: packuswb %xmm6, %xmm0 748; SSE2-NEXT: movdqa %xmm1, %xmm6 749; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 750; SSE2-NEXT: pmullw %xmm4, %xmm6 751; SSE2-NEXT: pand %xmm5, %xmm6 752; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 753; SSE2-NEXT: pmullw %xmm4, %xmm1 754; SSE2-NEXT: pand %xmm5, %xmm1 755; SSE2-NEXT: packuswb %xmm6, %xmm1 756; SSE2-NEXT: movdqa %xmm2, %xmm6 757; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 758; SSE2-NEXT: pmullw %xmm4, %xmm6 759; SSE2-NEXT: pand %xmm5, %xmm6 760; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 761; SSE2-NEXT: pmullw %xmm4, %xmm2 762; SSE2-NEXT: pand %xmm5, %xmm2 763; SSE2-NEXT: packuswb %xmm6, %xmm2 764; SSE2-NEXT: movdqa %xmm3, %xmm6 765; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 766; SSE2-NEXT: pmullw %xmm4, %xmm6 767; SSE2-NEXT: pand %xmm5, %xmm6 768; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 769; SSE2-NEXT: pmullw %xmm4, %xmm3 770; SSE2-NEXT: pand %xmm5, %xmm3 771; 
SSE2-NEXT: packuswb %xmm6, %xmm3 772; SSE2-NEXT: retq 773; 774; SSE41-LABEL: mul_v64i8c: 775; SSE41: # %bb.0: # %entry 776; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] 777; SSE41-NEXT: movdqa %xmm0, %xmm6 778; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 779; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 780; SSE41-NEXT: pand %xmm5, %xmm6 781; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] 782; SSE41-NEXT: pmaddubsw %xmm7, %xmm0 783; SSE41-NEXT: psllw $8, %xmm0 784; SSE41-NEXT: por %xmm6, %xmm0 785; SSE41-NEXT: movdqa %xmm1, %xmm6 786; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 787; SSE41-NEXT: pand %xmm5, %xmm6 788; SSE41-NEXT: pmaddubsw %xmm7, %xmm1 789; SSE41-NEXT: psllw $8, %xmm1 790; SSE41-NEXT: por %xmm6, %xmm1 791; SSE41-NEXT: movdqa %xmm2, %xmm6 792; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 793; SSE41-NEXT: pand %xmm5, %xmm6 794; SSE41-NEXT: pmaddubsw %xmm7, %xmm2 795; SSE41-NEXT: psllw $8, %xmm2 796; SSE41-NEXT: por %xmm6, %xmm2 797; SSE41-NEXT: movdqa %xmm3, %xmm6 798; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 799; SSE41-NEXT: pand %xmm5, %xmm6 800; SSE41-NEXT: pmaddubsw %xmm7, %xmm3 801; SSE41-NEXT: psllw $8, %xmm3 802; SSE41-NEXT: por %xmm6, %xmm3 803; SSE41-NEXT: retq 804; 805; AVX2-LABEL: mul_v64i8c: 806; AVX2: # %bb.0: # %entry 807; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] 808; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3 809; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 810; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 811; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] 812; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0 813; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 814; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 815; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 816; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 817; AVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1 818; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 819; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 820; AVX2-NEXT: retq 821; 822; AVX512F-LABEL: mul_v64i8c: 823; AVX512F: # %bb.0: # %entry 824; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 825; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] 826; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3 827; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2 828; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 829; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] 830; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm0 831; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 832; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 833; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 834; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 835; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) 836; AVX512F-NEXT: retq 837; 838; AVX512BW-LABEL: mul_v64i8c: 839; AVX512BW: # %bb.0: # %entry 840; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] 841; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # 
[0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] 842; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 843; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) 844; AVX512BW-NEXT: retq 845entry: 846 %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 > 847 ret <64 x i8> %A 848} 849 850define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { 851; SSE2-LABEL: mul_v64i8: 852; SSE2: # %bb.0: # %entry 853; SSE2-NEXT: movdqa %xmm4, %xmm8 854; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 855; SSE2-NEXT: movdqa %xmm0, %xmm9 856; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 857; SSE2-NEXT: pmullw %xmm8, %xmm9 858; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 859; SSE2-NEXT: pand %xmm8, %xmm9 860; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 861; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 862; SSE2-NEXT: pmullw %xmm4, %xmm0 863; SSE2-NEXT: pand %xmm8, %xmm0 864; SSE2-NEXT: packuswb %xmm9, %xmm0 865; SSE2-NEXT: movdqa %xmm5, %xmm4 866; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 867; SSE2-NEXT: movdqa %xmm1, %xmm9 868; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 869; SSE2-NEXT: pmullw %xmm4, %xmm9 870; SSE2-NEXT: pand %xmm8, %xmm9 871; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 872; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 873; SSE2-NEXT: pmullw %xmm5, %xmm1 874; SSE2-NEXT: pand %xmm8, %xmm1 875; SSE2-NEXT: packuswb %xmm9, %xmm1 876; SSE2-NEXT: movdqa %xmm6, %xmm4 877; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 878; SSE2-NEXT: movdqa %xmm2, %xmm5 879; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 880; SSE2-NEXT: pmullw %xmm4, %xmm5 881; SSE2-NEXT: pand %xmm8, %xmm5 882; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 883; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 884; SSE2-NEXT: pmullw %xmm6, %xmm2 885; SSE2-NEXT: pand %xmm8, %xmm2 886; SSE2-NEXT: packuswb %xmm5, %xmm2 887; SSE2-NEXT: movdqa %xmm7, %xmm4 888; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 889; SSE2-NEXT: movdqa %xmm3, %xmm5 890; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 891; SSE2-NEXT: pmullw %xmm4, %xmm5 892; SSE2-NEXT: pand %xmm8, %xmm5 893; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 894; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 895; SSE2-NEXT: pmullw %xmm7, %xmm3 896; SSE2-NEXT: pand %xmm8, %xmm3 897; SSE2-NEXT: packuswb %xmm5, %xmm3 898; SSE2-NEXT: retq 899; 900; SSE41-LABEL: mul_v64i8: 901; SSE41: 
# %bb.0: # %entry 902; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 903; SSE41-NEXT: movdqa %xmm8, %xmm9 904; SSE41-NEXT: pandn %xmm4, %xmm9 905; SSE41-NEXT: pand %xmm8, %xmm4 906; SSE41-NEXT: movdqa %xmm0, %xmm10 907; SSE41-NEXT: pmaddubsw %xmm4, %xmm10 908; SSE41-NEXT: pand %xmm8, %xmm10 909; SSE41-NEXT: pmaddubsw %xmm9, %xmm0 910; SSE41-NEXT: psllw $8, %xmm0 911; SSE41-NEXT: por %xmm10, %xmm0 912; SSE41-NEXT: movdqa %xmm8, %xmm4 913; SSE41-NEXT: pandn %xmm5, %xmm4 914; SSE41-NEXT: pand %xmm8, %xmm5 915; SSE41-NEXT: movdqa %xmm1, %xmm9 916; SSE41-NEXT: pmaddubsw %xmm5, %xmm9 917; SSE41-NEXT: pand %xmm8, %xmm9 918; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 919; SSE41-NEXT: psllw $8, %xmm1 920; SSE41-NEXT: por %xmm9, %xmm1 921; SSE41-NEXT: movdqa %xmm8, %xmm4 922; SSE41-NEXT: pandn %xmm6, %xmm4 923; SSE41-NEXT: pand %xmm8, %xmm6 924; SSE41-NEXT: movdqa %xmm2, %xmm5 925; SSE41-NEXT: pmaddubsw %xmm6, %xmm5 926; SSE41-NEXT: pand %xmm8, %xmm5 927; SSE41-NEXT: pmaddubsw %xmm4, %xmm2 928; SSE41-NEXT: psllw $8, %xmm2 929; SSE41-NEXT: por %xmm5, %xmm2 930; SSE41-NEXT: movdqa %xmm7, %xmm4 931; SSE41-NEXT: pand %xmm8, %xmm4 932; SSE41-NEXT: movdqa %xmm3, %xmm5 933; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 934; SSE41-NEXT: pand %xmm8, %xmm5 935; SSE41-NEXT: pandn %xmm7, %xmm8 936; SSE41-NEXT: pmaddubsw %xmm8, %xmm3 937; SSE41-NEXT: psllw $8, %xmm3 938; SSE41-NEXT: por %xmm5, %xmm3 939; SSE41-NEXT: retq 940; 941; AVX2-LABEL: mul_v64i8: 942; AVX2: # %bb.0: # %entry 943; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 944; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 945; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 946; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 947; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 948; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 949; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 950; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 951; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 952; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 953; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 954; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 955; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 956; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 957; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 958; AVX2-NEXT: retq 959; 960; AVX512F-LABEL: mul_v64i8: 961; AVX512F: # %bb.0: # %entry 962; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 963; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 964; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 965; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 966; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4 967; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6 968; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6 969; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 970; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 971; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 972; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 973; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1 974; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 975; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 976; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 977; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2) 978; AVX512F-NEXT: retq 979; 980; AVX512BW-LABEL: mul_v64i8: 981; AVX512BW: # %bb.0: # %entry 982; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 983; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 984; 
AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 985; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 986; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 987; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 988; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) 989; AVX512BW-NEXT: retq 990entry: 991 %A = mul <64 x i8> %i, %j 992 ret <64 x i8> %A 993} 994 995; PR30845 996define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { 997; SSE2-LABEL: mul_v4i64_zero_upper: 998; SSE2: # %bb.0: # %entry 999; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 1000; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] 1001; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] 1002; SSE2-NEXT: pmuludq %xmm2, %xmm0 1003; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 1004; SSE2-NEXT: pmuludq %xmm3, %xmm1 1005; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1006; SSE2-NEXT: retq 1007; 1008; SSE41-LABEL: mul_v4i64_zero_upper: 1009; SSE41: # %bb.0: # %entry 1010; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 1011; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] 1012; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero 1013; SSE41-NEXT: pmuludq %xmm2, %xmm0 1014; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 1015; SSE41-NEXT: pmuludq %xmm3, %xmm1 1016; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1017; SSE41-NEXT: retq 1018; 1019; AVX-LABEL: mul_v4i64_zero_upper: 1020; AVX: # %bb.0: # %entry 1021; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1022; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1023; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 1024; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 1025; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1026; AVX-NEXT: vzeroupper 1027; AVX-NEXT: retq 1028entry: 1029 %val1a = zext <4 x i32> %val1 to <4 x i64> 1030 %val2a = zext <4 x i32> %val2 to <4 x i64> 1031 %res64 = mul <4 x i64> %val1a, %val2a 1032 %rescast = bitcast <4 x i64> %res64 to <8 x i32> 1033 %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1034 ret <4 x i32> %res 1035} 1036 1037define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) { 1038; SSE2-LABEL: mul_v4i64_zero_upper_left: 1039; SSE2: # %bb.0: # %entry 1040; SSE2-NEXT: pxor %xmm4, %xmm4 1041; SSE2-NEXT: movdqa %xmm0, %xmm3 1042; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 1043; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] 1044; SSE2-NEXT: movdqa %xmm0, %xmm4 1045; SSE2-NEXT: pmuludq %xmm2, %xmm4 1046; SSE2-NEXT: psrlq $32, %xmm2 1047; SSE2-NEXT: pmuludq %xmm0, %xmm2 1048; SSE2-NEXT: psllq $32, %xmm2 1049; SSE2-NEXT: paddq %xmm4, %xmm2 1050; SSE2-NEXT: movdqa %xmm3, %xmm0 1051; SSE2-NEXT: pmuludq %xmm1, %xmm0 1052; SSE2-NEXT: psrlq $32, %xmm1 1053; SSE2-NEXT: pmuludq %xmm1, %xmm3 1054; SSE2-NEXT: psllq $32, %xmm3 1055; SSE2-NEXT: paddq %xmm0, %xmm3 1056; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] 1057; SSE2-NEXT: movaps %xmm3, %xmm0 1058; SSE2-NEXT: retq 1059; 1060; SSE41-LABEL: mul_v4i64_zero_upper_left: 1061; SSE41: # %bb.0: # %entry 1062; SSE41-NEXT: pxor %xmm4, %xmm4 1063; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero 1064; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] 1065; SSE41-NEXT: movdqa %xmm0, %xmm4 1066; SSE41-NEXT: pmuludq %xmm2, %xmm4 1067; SSE41-NEXT: psrlq $32, %xmm2 1068; SSE41-NEXT: pmuludq %xmm0, %xmm2 1069; SSE41-NEXT: psllq $32, 
%xmm2 1070; SSE41-NEXT: paddq %xmm4, %xmm2 1071; SSE41-NEXT: movdqa %xmm3, %xmm0 1072; SSE41-NEXT: pmuludq %xmm1, %xmm0 1073; SSE41-NEXT: psrlq $32, %xmm1 1074; SSE41-NEXT: pmuludq %xmm1, %xmm3 1075; SSE41-NEXT: psllq $32, %xmm3 1076; SSE41-NEXT: paddq %xmm0, %xmm3 1077; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] 1078; SSE41-NEXT: movaps %xmm3, %xmm0 1079; SSE41-NEXT: retq 1080; 1081; AVX-LABEL: mul_v4i64_zero_upper_left: 1082; AVX: # %bb.0: # %entry 1083; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1084; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 1085; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 1086; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 1087; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 1088; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 1089; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 1090; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1091; AVX-NEXT: vzeroupper 1092; AVX-NEXT: retq 1093entry: 1094 %val1a = zext <4 x i32> %val1 to <4 x i64> 1095 %res64 = mul <4 x i64> %val1a, %val2 1096 %rescast = bitcast <4 x i64> %res64 to <8 x i32> 1097 %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1098 ret <4 x i32> %res 1099} 1100 1101define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) { 1102; SSE2-LABEL: mul_v4i64_zero_lower: 1103; SSE2: # %bb.0: # %entry 1104; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3] 1105; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 1106; SSE2-NEXT: psrlq $32, %xmm2 1107; SSE2-NEXT: pmuludq %xmm0, %xmm2 1108; SSE2-NEXT: psrlq $32, %xmm1 1109; SSE2-NEXT: pmuludq %xmm1, %xmm3 1110; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] 1111; SSE2-NEXT: movaps %xmm3, %xmm0 1112; SSE2-NEXT: retq 1113; 1114; SSE41-LABEL: mul_v4i64_zero_lower: 1115; SSE41: # %bb.0: # %entry 1116; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero 1117; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 1118; SSE41-NEXT: psrlq $32, %xmm2 1119; SSE41-NEXT: pmuludq %xmm0, %xmm2 1120; SSE41-NEXT: psrlq $32, %xmm1 1121; SSE41-NEXT: pmuludq %xmm1, %xmm3 1122; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] 1123; SSE41-NEXT: movaps %xmm3, %xmm0 1124; SSE41-NEXT: retq 1125; 1126; AVX-LABEL: mul_v4i64_zero_lower: 1127; AVX: # %bb.0: # %entry 1128; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1129; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 1130; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 1131; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 1132; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1133; AVX-NEXT: vzeroupper 1134; AVX-NEXT: retq 1135entry: 1136 %val1a = zext <4 x i32> %val1 to <4 x i64> 1137 %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296> 1138 %res64 = mul <4 x i64> %val1a, %val2a 1139 %rescast = bitcast <4 x i64> %res64 to <8 x i32> 1140 %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1141 ret <4 x i32> %res 1142} 1143 1144define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { 1145; SSE2-LABEL: mul_v8i64_zero_upper: 1146; SSE2: # %bb.0: # %entry 1147; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3] 1148; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3] 1149; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,1,3] 1150; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3] 1151; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] 1152; SSE2-NEXT: pmuludq %xmm4, %xmm0 1153; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] 1154; SSE2-NEXT: pmuludq 
%xmm5, %xmm1 1155; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1156; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] 1157; SSE2-NEXT: pmuludq %xmm6, %xmm1 1158; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] 1159; SSE2-NEXT: pmuludq %xmm7, %xmm2 1160; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] 1161; SSE2-NEXT: retq 1162; 1163; SSE41-LABEL: mul_v8i64_zero_upper: 1164; SSE41: # %bb.0: # %entry 1165; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero 1166; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3] 1167; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero 1168; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3] 1169; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero 1170; SSE41-NEXT: pmuludq %xmm4, %xmm0 1171; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] 1172; SSE41-NEXT: pmuludq %xmm5, %xmm1 1173; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1174; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero 1175; SSE41-NEXT: pmuludq %xmm6, %xmm1 1176; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] 1177; SSE41-NEXT: pmuludq %xmm7, %xmm2 1178; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] 1179; SSE41-NEXT: retq 1180; 1181; AVX2-LABEL: mul_v8i64_zero_upper: 1182; AVX2: # %bb.0: # %entry 1183; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1184; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1185; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1186; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1187; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 1188; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1189; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1190; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 1191; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] 1192; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] 1193; AVX2-NEXT: retq 1194; 1195; AVX512-LABEL: mul_v8i64_zero_upper: 1196; AVX512: # %bb.0: # %entry 1197; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero 1198; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 1199; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 1200; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1201; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] 1202; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] 1203; AVX512-NEXT: retq 1204entry: 1205 %val1a = zext <8 x i32> %val1 to <8 x i64> 1206 %val2a = zext <8 x i32> %val2 to <8 x i64> 1207 %res64 = mul <8 x i64> %val1a, %val2a 1208 %rescast = bitcast <8 x i64> %res64 to <16 x i32> 1209 %res = shufflevector <16 x i32> %rescast, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 9, i32 11, i32 13, i32 15 > 1210 ret <8 x i32> %res 1211} 1212 1213define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { 1214; SSE2-LABEL: mul_v8i64_sext: 1215; SSE2: # %bb.0: 1216; SSE2-NEXT: movdqa %xmm1, %xmm4 1217; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 1218; SSE2-NEXT: psrad $16, %xmm6 1219; SSE2-NEXT: pxor %xmm12, %xmm12 1220; SSE2-NEXT: pxor %xmm7, %xmm7 1221; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 1222; SSE2-NEXT: movdqa %xmm6, %xmm5 1223; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = 
xmm5[2],xmm7[2],xmm5[3],xmm7[3] 1224; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 1225; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1226; SSE2-NEXT: psrad $16, %xmm0 1227; SSE2-NEXT: pxor %xmm11, %xmm11 1228; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 1229; SSE2-NEXT: movdqa %xmm0, %xmm9 1230; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] 1231; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 1232; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 1233; SSE2-NEXT: pxor %xmm8, %xmm8 1234; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 1235; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] 1236; SSE2-NEXT: pxor %xmm10, %xmm10 1237; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 1238; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] 1239; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1240; SSE2-NEXT: pxor %xmm13, %xmm13 1241; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 1242; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] 1243; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 1244; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] 1245; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3] 1246; SSE2-NEXT: pmuludq %xmm4, %xmm14 1247; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3] 1248; SSE2-NEXT: pmuludq %xmm0, %xmm12 1249; SSE2-NEXT: paddq %xmm14, %xmm12 1250; SSE2-NEXT: psllq $32, %xmm12 1251; SSE2-NEXT: pmuludq %xmm4, %xmm0 1252; SSE2-NEXT: paddq %xmm12, %xmm0 1253; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3] 1254; SSE2-NEXT: pmuludq %xmm1, %xmm4 1255; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3] 1256; SSE2-NEXT: pmuludq %xmm9, %xmm11 1257; SSE2-NEXT: paddq %xmm4, %xmm11 1258; SSE2-NEXT: psllq $32, %xmm11 1259; SSE2-NEXT: pmuludq %xmm9, %xmm1 1260; SSE2-NEXT: paddq %xmm11, %xmm1 1261; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] 1262; SSE2-NEXT: pmuludq %xmm2, %xmm4 1263; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3] 1264; SSE2-NEXT: pmuludq %xmm6, %xmm9 1265; SSE2-NEXT: paddq %xmm4, %xmm9 1266; SSE2-NEXT: psllq $32, %xmm9 1267; SSE2-NEXT: pmuludq %xmm6, %xmm2 1268; SSE2-NEXT: paddq %xmm9, %xmm2 1269; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3] 1270; SSE2-NEXT: pmuludq %xmm3, %xmm4 1271; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,1,3] 1272; SSE2-NEXT: pmuludq %xmm5, %xmm6 1273; SSE2-NEXT: paddq %xmm4, %xmm6 1274; SSE2-NEXT: psllq $32, %xmm6 1275; SSE2-NEXT: pmuludq %xmm5, %xmm3 1276; SSE2-NEXT: paddq %xmm6, %xmm3 1277; SSE2-NEXT: retq 1278; 1279; SSE41-LABEL: mul_v8i64_sext: 1280; SSE41: # %bb.0: 1281; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] 1282; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 1283; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1284; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 1285; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 1286; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 1287; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 1288; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3] 1289; SSE41-NEXT: pmuldq %xmm4, %xmm3 1290; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 1291; SSE41-NEXT: pmuldq %xmm5, %xmm2 1292; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,3,3] 1293; SSE41-NEXT: pmuldq %xmm6, %xmm4 1294; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero 1295; SSE41-NEXT: pmuldq %xmm7, %xmm0 1296; SSE41-NEXT: movdqa %xmm4, %xmm1 1297; SSE41-NEXT: retq 1298; 1299; AVX2-LABEL: mul_v8i64_sext: 1300; AVX2: # %bb.0: 1301; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 1302; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 1303; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 1304; 
AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1305; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 1306; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 1307; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1308; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 1309; AVX2-NEXT: vmovdqa %ymm2, %ymm1 1310; AVX2-NEXT: retq 1311; 1312; AVX512-LABEL: mul_v8i64_sext: 1313; AVX512: # %bb.0: 1314; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 1315; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 1316; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 1317; AVX512-NEXT: retq 1318 %1 = sext <8 x i16> %val1 to <8 x i64> 1319 %2 = sext <8 x i32> %val2 to <8 x i64> 1320 %3 = mul <8 x i64> %1, %2 1321 ret <8 x i64> %3 1322} 1323 1324define <2 x i64> @pmuldq_square(<2 x i64> %x) { 1325; SSE2-LABEL: pmuldq_square: 1326; SSE2: # %bb.0: 1327; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] 1328; SSE2-NEXT: psllq $32, %xmm0 1329; SSE2-NEXT: psrad $31, %xmm0 1330; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] 1331; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1332; SSE2-NEXT: psrlq $32, %xmm0 1333; SSE2-NEXT: pmuludq %xmm1, %xmm0 1334; SSE2-NEXT: pmuludq %xmm1, %xmm1 1335; SSE2-NEXT: psllq $33, %xmm0 1336; SSE2-NEXT: paddq %xmm1, %xmm0 1337; SSE2-NEXT: retq 1338; 1339; SSE41-LABEL: pmuldq_square: 1340; SSE41: # %bb.0: 1341; SSE41-NEXT: pmuldq %xmm0, %xmm0 1342; SSE41-NEXT: retq 1343; 1344; AVX-LABEL: pmuldq_square: 1345; AVX: # %bb.0: 1346; AVX-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 1347; AVX-NEXT: retq 1348 %1 = shl <2 x i64> %x, <i64 32, i64 32> 1349 %2 = ashr exact <2 x i64> %1, <i64 32, i64 32> 1350 %3 = mul nsw <2 x i64> %2, %2 1351 ret <2 x i64> %3 1352} 1353 1354define <2 x i64> @pmuludq_square(<2 x i64> %x) { 1355; SSE-LABEL: pmuludq_square: 1356; SSE: # %bb.0: 1357; SSE-NEXT: pmuludq %xmm0, %xmm0 1358; SSE-NEXT: retq 1359; 1360; AVX-LABEL: pmuludq_square: 1361; AVX: # %bb.0: 1362; AVX-NEXT: vpmuludq %xmm0, %xmm0, %xmm0 1363; AVX-NEXT: retq 1364 %1 = and <2 x i64> %x, <i64 4294967295, i64 4294967295> 1365 %2 = mul nuw <2 x i64> %1, %1 1366 ret <2 x i64> %2 1367} 1368