; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW

define <4 x i16> @zext_mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: and_mulhuw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    xorps %xmm3, %xmm3
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm3, %xmm0
; SSE2-NEXT:    pmulhuw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: and_mulhuw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packusdw %xmm4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm4, %xmm0
; SSE41-NEXT:    pmulhuw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: and_mulhuw_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: and_mulhuw_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535>
  %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535>
  %c = mul <4 x i64> %a1, %b1
  %d = lshr <4 x i64> %c, <i64 16, i64 16, i64 16, i64 16>
  %e = trunc <4 x i64> %d to <4 x i16>
  ret <4 x i16> %e
}

define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: sext_mulhw_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sext_mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: ashr_mulhw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    packssdw %xmm1, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm0, %xmm0
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: ashr_mulhw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    packusdw %xmm1, %xmm1
; SSE41-NEXT:    packusdw %xmm0, %xmm0
; SSE41-NEXT:    pmulhw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ashr_mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX-NEXT:    vpackusdw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
  %b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

define <8 x i16> @zext_mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @lshr_mulhuw_v8i16(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: lshr_mulhuw_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pmulhuw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: lshr_mulhuw_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pmulhuw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: lshr_mulhuw_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: lshr_mulhuw_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @sext_mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: sext_mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sext_mulhw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: sextinreg_mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $24, %xmm1
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    pslld $24, %xmm0
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    pslld $25, %xmm3
; SSE-NEXT:    psrad $25, %xmm3
; SSE-NEXT:    pslld $25, %xmm2
; SSE-NEXT:    psrad $25, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: sextinreg_mulhw_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT:    vpslld $25, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $25, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: sextinreg_mulhw_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vpsllw $9, %xmm1, %xmm1
; AVX512-NEXT:    vpsraw $9, %xmm1, %xmm1
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX512-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %a1 = shl <8 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %b1 = shl <8 x i32> %b, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
  %a2 = ashr <8 x i32> %a1, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %b2 = ashr <8 x i32> %b1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
  %c = mul <8 x i32> %a2, %b2
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @zext_mulhuw_v8i16_v8i33(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v8i16_v8i33:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v8i16_v8i33:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i33>
  %b1 = zext <8 x i16> %b to <8 x i33>
  %c = mul <8 x i33> %a1, %b1
  %d = lshr <8 x i33> %c, <i33 16, i33 16, i33 16, i33 16, i33 16, i33 16, i33 16, i33 16>
  %e = trunc <8 x i33> %d to <8 x i16>
  ret <8 x i16> %e
}

define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: and_mulhuw_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm7
; SSE2-NEXT:    pand %xmm8, %xmm6
; SSE2-NEXT:    packssdw %xmm7, %xmm6
; SSE2-NEXT:    pmulhw %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm8
; SSE2-NEXT:    packssdw %xmm5, %xmm8
; SSE2-NEXT:    pmulhw %xmm8, %xmm0
; SSE2-NEXT:    movdqa %xmm6, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: and_mulhuw_v16i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE41-NEXT:    pand %xmm8, %xmm3
; SSE41-NEXT:    pand %xmm8, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm8, %xmm1
; SSE41-NEXT:    pand %xmm8, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm8, %xmm7
; SSE41-NEXT:    pand %xmm8, %xmm6
; SSE41-NEXT:    packusdw %xmm7, %xmm6
; SSE41-NEXT:    pmulhw %xmm2, %xmm6
; SSE41-NEXT:    pand %xmm8, %xmm5
; SSE41-NEXT:    pand %xmm4, %xmm8
; SSE41-NEXT:    packusdw %xmm5, %xmm8
; SSE41-NEXT:    pmulhw %xmm8, %xmm0
; SSE41-NEXT:    movdqa %xmm6, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: and_mulhuw_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: and_mulhuw_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: and_mulhuw_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512BW-NEXT:    vpandd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: sext_mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sext_mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: ashr_mulhuw_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $16, %xmm5
; SSE2-NEXT:    psrad $16, %xmm4
; SSE2-NEXT:    packssdw %xmm5, %xmm4
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pmulhw %xmm4, %xmm0
; SSE2-NEXT:    psrad $16, %xmm7
; SSE2-NEXT:    psrad $16, %xmm6
; SSE2-NEXT:    packssdw %xmm7, %xmm6
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    pmulhw %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: ashr_mulhuw_v16i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm7
; SSE41-NEXT:    psrld $16, %xmm6
; SSE41-NEXT:    packusdw %xmm7, %xmm6
; SSE41-NEXT:    pmulhw %xmm2, %xmm6
; SSE41-NEXT:    psrld $16, %xmm5
; SSE41-NEXT:    psrld $16, %xmm4
; SSE41-NEXT:    packusdw %xmm5, %xmm4
; SSE41-NEXT:    pmulhw %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm6, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: ashr_mulhuw_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ashr_mulhuw_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpsrld $16, %zmm1, %zmm1
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <16 x i16> @zext_mulhuw_v16i16_v16i48(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v16i16_v16i48:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v16i16_v16i48:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i48>
  %b1 = zext <16 x i16> %b to <16 x i48>
  %c = mul <16 x i48> %a1, %b1
  %d = lshr <16 x i48> %c, <i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16, i48 16>
  %e = trunc <16 x i48> %d to <16 x i16>
  ret <16 x i16> %e
}

define <32 x i16> @zext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm4, %xmm0
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    pmulhuw %xmm6, %xmm2
; SSE-NEXT:    pmulhuw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: zext_mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: zext_mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: zext_mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

define <32 x i16> @sext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: sext_mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm4, %xmm0
; SSE-NEXT:    pmulhw %xmm5, %xmm1
; SSE-NEXT:    pmulhw %xmm6, %xmm2
; SSE-NEXT:    pmulhw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: sext_mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: sext_mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: sext_mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

define <64 x i16> @zext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: zext_mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: zext_mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: zext_mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

define <64 x i16> @sext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: sext_mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: sext_mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: sext_mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT:    vpmulhw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: sext_mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

define <8 x i16> @zext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: zext_mulhuw_v8i16_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v8i16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i64>
  %b1 = zext <8 x i16> %b to <8 x i64>
  %c = mul <8 x i64> %a1, %b1
  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
  %e = trunc <8 x i64> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @sext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: sext_mulhuw_v8i16_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sext_mulhuw_v8i16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i64>
  %b1 = sext <8 x i16> %b to <8 x i64>
  %c = mul <8 x i64> %a1, %b1
  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
  %e = trunc <8 x i64> %d to <8 x i16>
  ret <8 x i16> %e
}

define <4 x i32> @zext_mulhuw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v4i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: zext_mulhuw_v4i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhuw %xmm1, %xmm0
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v4i16_lshr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %d
}

define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhsw_v4i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v4i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhw %xmm1, %xmm0
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhsw_v4i16_lshr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %d
}

define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhsw_v4i16_ashr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v4i16_ashr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhw %xmm1, %xmm0
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhsw_v4i16_ashr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = ashr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %d
}

define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v8i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: zext_mulhuw_v8i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v8i16_lshr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %d
}

define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mulhsw_v8i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v8i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmulhw %xmm1, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhsw_v8i16_lshr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %d
}

define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mulhsw_v8i16_ashr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v8i16_ashr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhw %xmm1, %xmm0
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhsw_v8i16_ashr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = ashr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %d
}

define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v16i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pmulhuw %xmm2, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: zext_mulhuw_v16i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE41-NEXT:    pmulhuw %xmm3, %xmm4
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE41-NEXT:    movdqa %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: zext_mulhuw_v16i16_lshr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: zext_mulhuw_v16i16_lshr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i32> %d
}

; PR109790
define void @PR109790(ptr sret([32 x i8]) %ret, ptr %a) {
; SSE-LABEL: PR109790:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    movdqa (%rsi), %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pand 16(%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536]
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, (%rdi)
; SSE-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: PR109790:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536]
; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: PR109790:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: PR109790:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq %rdi, %rax
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512BW-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0]
; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %load = load <16 x i16>, ptr %a, align 32
  %and = and <16 x i16> %load, <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>
  %ext = zext nneg <16 x i16> %and to <16 x i32>
  %mul = mul nsw <16 x i32> %ext, <i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000>
  %srl = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc nuw <16 x i32> %srl to <16 x i16>
  store <16 x i16> %res, ptr %ret, align 32
  ret void
}

; PR109790
define <16 x i16> @zext_mulhuw_v16i16_negative_constant(<16 x i16> %a) {
; SSE-LABEL: zext_mulhuw_v16i16_negative_constant:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536]
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v16i16_negative_constant:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536]
; AVX-NEXT:    retq
  %k = and <16 x i16> %a, <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>
  %x = zext nneg <16 x i16> %k to <16 x i32>
  %m = mul nsw <16 x i32> %x, <i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000, i32 -1000>
  %s = lshr <16 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %t = trunc nuw <16 x i32> %s to <16 x i16>
  ret <16 x i16> %t
}

; PR109790
define <16 x i16> @zext_mulhuw_v16i16_positive_constant(<16 x i16> %a) {
; SSE-LABEL: zext_mulhuw_v16i16_positive_constant:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1000,1000,1000,1000,1000,1000,1000,1000]
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: zext_mulhuw_v16i16_positive_constant:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000]
; AVX-NEXT:    retq
  %k = and <16 x i16> %a, <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>
  %x = zext nneg <16 x i16> %k to <16 x i32>
  %m = mul nuw nsw <16 x i32> %x, <i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000, i32 1000>
  %s = lshr <16 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %t = trunc nuw nsw <16 x i32> %s to <16 x i16>
  ret <16 x i16> %t
}

define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: mulhsw_v16i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pmulhw %xmm2, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE2-NEXT:    pmulhw %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v16i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhw %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE41-NEXT:    pmulhw %xmm3, %xmm4
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE41-NEXT:    movdqa %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mulhsw_v16i16_lshr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mulhsw_v16i16_lshr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i32> %d
}

define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: mulhsw_v16i16_ashr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw %xmm2, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm5
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT:    psrad $16, %xmm4
; SSE2-NEXT:    pmulhw %xmm3, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v16i16_ashr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhw %xmm2, %xmm0
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
; SSE41-NEXT:    pmulhw %xmm3, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm5, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mulhsw_v16i16_ashr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mulhsw_v16i16_ashr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = ashr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i32> %d
}

define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v32i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    pmulhuw %xmm4, %xmm0
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pmulhuw %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pmulhuw %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT:    pmulhuw %xmm7, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm7, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm8, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: zext_mulhuw_v32i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    pmulhuw %xmm4, %xmm0
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    pxor %xmm8, %xmm8
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE41-NEXT:    pmulhuw %xmm5, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE41-NEXT:    pmulhuw %xmm6, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE41-NEXT:    pmulhuw %xmm7, %xmm3
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    retq
;
; AVX2-LABEL: zext_mulhuw_v32i16_lshr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: zext_mulhuw_v32i16_lshr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: zext_mulhuw_v32i16_lshr:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <32 x i32> %d
}

define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: mulhsw_v32i16_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    pmulhw %xmm4, %xmm0
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pmulhw %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pmulhw %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT:    pmulhw %xmm7, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm7, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm8, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhsw_v32i16_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    pmulhw %xmm4, %xmm0
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    pxor %xmm8, %xmm8
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE41-NEXT:    pmulhw %xmm5, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE41-NEXT:    pmulhw %xmm6, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE41-NEXT:    pmulhw %xmm7, %xmm3
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mulhsw_v32i16_lshr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhsw_v32i16_lshr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhsw_v32i16_lshr:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm1
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1330; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1331; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1332; AVX512BW-NEXT: retq 1333 %a1 = sext <32 x i16> %a to <32 x i32> 1334 %b1 = sext <32 x i16> %b to <32 x i32> 1335 %c = mul <32 x i32> %a1, %b1 1336 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1337 ret <32 x i32> %d 1338} 1339 1340define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) { 1341; SSE2-LABEL: mulhsw_v32i16_ashr: 1342; SSE2: # %bb.0: 1343; SSE2-NEXT: movq %rdi, %rax 1344; SSE2-NEXT: pmulhw %xmm4, %xmm0 1345; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 1346; SSE2-NEXT: psrad $16, %xmm4 1347; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1348; SSE2-NEXT: psrad $16, %xmm0 1349; SSE2-NEXT: pmulhw %xmm5, %xmm1 1350; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 1351; SSE2-NEXT: psrad $16, %xmm5 1352; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1353; SSE2-NEXT: psrad $16, %xmm1 1354; SSE2-NEXT: pmulhw %xmm6, %xmm2 1355; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 1356; SSE2-NEXT: psrad $16, %xmm6 1357; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1358; SSE2-NEXT: psrad $16, %xmm2 1359; SSE2-NEXT: pmulhw %xmm7, %xmm3 1360; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 1361; SSE2-NEXT: psrad $16, %xmm7 1362; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1363; SSE2-NEXT: psrad $16, %xmm3 1364; SSE2-NEXT: movdqa %xmm3, 112(%rdi) 1365; SSE2-NEXT: movdqa %xmm7, 96(%rdi) 1366; SSE2-NEXT: movdqa %xmm2, 80(%rdi) 1367; SSE2-NEXT: movdqa %xmm6, 64(%rdi) 1368; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1369; SSE2-NEXT: movdqa %xmm5, 32(%rdi) 1370; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1371; SSE2-NEXT: movdqa %xmm4, (%rdi) 1372; SSE2-NEXT: retq 1373; 1374; SSE41-LABEL: mulhsw_v32i16_ashr: 1375; SSE41: # %bb.0: 1376; SSE41-NEXT: movq %rdi, %rax 1377; SSE41-NEXT: pmulhw %xmm4, %xmm0 1378; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 1379; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1380; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 1381; SSE41-NEXT: pmulhw %xmm5, %xmm1 1382; SSE41-NEXT: pmovsxwd %xmm1, %xmm5 1383; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1384; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 1385; SSE41-NEXT: pmulhw %xmm6, %xmm2 1386; SSE41-NEXT: pmovsxwd %xmm2, %xmm6 1387; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 1388; SSE41-NEXT: pmovsxwd %xmm2, %xmm2 1389; SSE41-NEXT: pmulhw %xmm7, %xmm3 1390; SSE41-NEXT: pmovsxwd %xmm3, %xmm7 1391; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 1392; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 1393; SSE41-NEXT: movdqa %xmm3, 112(%rdi) 1394; SSE41-NEXT: movdqa %xmm7, 96(%rdi) 1395; SSE41-NEXT: movdqa %xmm2, 80(%rdi) 1396; SSE41-NEXT: movdqa 
%xmm6, 64(%rdi) 1397; SSE41-NEXT: movdqa %xmm1, 48(%rdi) 1398; SSE41-NEXT: movdqa %xmm5, 32(%rdi) 1399; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 1400; SSE41-NEXT: movdqa %xmm4, (%rdi) 1401; SSE41-NEXT: retq 1402; 1403; AVX2-LABEL: mulhsw_v32i16_ashr: 1404; AVX2: # %bb.0: 1405; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm2 1406; AVX2-NEXT: vpmovsxwd %xmm2, %ymm0 1407; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 1408; AVX2-NEXT: vpmovsxwd %xmm2, %ymm4 1409; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 1410; AVX2-NEXT: vpmovsxwd %xmm1, %ymm2 1411; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1412; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3 1413; AVX2-NEXT: vmovdqa %ymm4, %ymm1 1414; AVX2-NEXT: retq 1415; 1416; AVX512F-LABEL: mulhsw_v32i16_ashr: 1417; AVX512F: # %bb.0: 1418; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 1419; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 1420; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1421; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1422; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 1423; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm1 1424; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 1425; AVX512F-NEXT: retq 1426; 1427; AVX512BW-LABEL: mulhsw_v32i16_ashr: 1428; AVX512BW: # %bb.0: 1429; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm1 1430; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm0 1431; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1432; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1 1433; AVX512BW-NEXT: retq 1434 %a1 = sext <32 x i16> %a to <32 x i32> 1435 %b1 = sext <32 x i16> %b to <32 x i32> 1436 %c = mul <32 x i32> %a1, %b1 1437 %d = ashr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1438 ret <32 x i32> %d 1439} 1440 1441define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { 1442; SSE2-LABEL: zext_mulhuw_v64i16_lshr: 1443; SSE2: # %bb.0: 1444; SSE2-NEXT: movdqa %xmm7, %xmm8 1445; SSE2-NEXT: movq %rdi, %rax 1446; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 1447; SSE2-NEXT: pxor %xmm10, %xmm10 1448; SSE2-NEXT: movdqa %xmm0, %xmm7 1449; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] 1450; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1451; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 1452; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 1453; SSE2-NEXT: movdqa %xmm1, %xmm9 1454; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 1455; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 1456; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 1457; SSE2-NEXT: movdqa %xmm2, %xmm11 1458; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 1459; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 1460; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 1461; SSE2-NEXT: movdqa %xmm3, %xmm12 1462; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] 1463; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 1464; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 1465; SSE2-NEXT: movdqa %xmm4, %xmm13 1466; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = 
xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] 1467; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] 1468; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 1469; SSE2-NEXT: movdqa %xmm5, %xmm14 1470; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] 1471; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] 1472; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 1473; SSE2-NEXT: movdqa %xmm6, %xmm15 1474; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] 1475; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 1476; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 1477; SSE2-NEXT: movdqa %xmm8, %xmm7 1478; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] 1479; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 1480; SSE2-NEXT: movdqa %xmm8, 240(%rdi) 1481; SSE2-NEXT: movdqa %xmm7, 224(%rdi) 1482; SSE2-NEXT: movdqa %xmm6, 208(%rdi) 1483; SSE2-NEXT: movdqa %xmm15, 192(%rdi) 1484; SSE2-NEXT: movdqa %xmm5, 176(%rdi) 1485; SSE2-NEXT: movdqa %xmm14, 160(%rdi) 1486; SSE2-NEXT: movdqa %xmm4, 144(%rdi) 1487; SSE2-NEXT: movdqa %xmm13, 128(%rdi) 1488; SSE2-NEXT: movdqa %xmm3, 112(%rdi) 1489; SSE2-NEXT: movdqa %xmm12, 96(%rdi) 1490; SSE2-NEXT: movdqa %xmm2, 80(%rdi) 1491; SSE2-NEXT: movdqa %xmm11, 64(%rdi) 1492; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1493; SSE2-NEXT: movdqa %xmm9, 32(%rdi) 1494; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1495; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1496; SSE2-NEXT: movaps %xmm0, (%rdi) 1497; SSE2-NEXT: retq 1498; 1499; SSE41-LABEL: zext_mulhuw_v64i16_lshr: 1500; SSE41: # %bb.0: 1501; SSE41-NEXT: movdqa %xmm0, %xmm8 1502; SSE41-NEXT: movq %rdi, %rax 1503; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 1504; SSE41-NEXT: pxor %xmm11, %xmm11 1505; SSE41-NEXT: movdqa %xmm8, %xmm0 1506; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 1507; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1508; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 1509; SSE41-NEXT: movdqa %xmm1, %xmm9 1510; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] 1511; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 1512; SSE41-NEXT: movdqa %xmm2, %xmm10 1513; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 1514; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 1515; SSE41-NEXT: movdqa %xmm3, %xmm12 1516; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 1517; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 1518; SSE41-NEXT: movdqa %xmm4, %xmm13 1519; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 1520; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 1521; SSE41-NEXT: movdqa %xmm5, %xmm14 1522; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] 1523; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 1524; SSE41-NEXT: movdqa %xmm6, %xmm15 1525; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] 
1526; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7 1527; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero 1528; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] 1529; SSE41-NEXT: movdqa %xmm7, 240(%rdi) 1530; SSE41-NEXT: movdqa %xmm0, 224(%rdi) 1531; SSE41-NEXT: movdqa %xmm15, 208(%rdi) 1532; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero 1533; SSE41-NEXT: movdqa %xmm0, 192(%rdi) 1534; SSE41-NEXT: movdqa %xmm14, 176(%rdi) 1535; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 1536; SSE41-NEXT: movdqa %xmm0, 160(%rdi) 1537; SSE41-NEXT: movdqa %xmm13, 144(%rdi) 1538; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 1539; SSE41-NEXT: movdqa %xmm0, 128(%rdi) 1540; SSE41-NEXT: movdqa %xmm12, 112(%rdi) 1541; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 1542; SSE41-NEXT: movdqa %xmm0, 96(%rdi) 1543; SSE41-NEXT: movdqa %xmm10, 80(%rdi) 1544; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 1545; SSE41-NEXT: movdqa %xmm0, 64(%rdi) 1546; SSE41-NEXT: movdqa %xmm9, 48(%rdi) 1547; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1548; SSE41-NEXT: movdqa %xmm0, 32(%rdi) 1549; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1550; SSE41-NEXT: movaps %xmm0, 16(%rdi) 1551; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero 1552; SSE41-NEXT: movdqa %xmm0, (%rdi) 1553; SSE41-NEXT: retq 1554; 1555; AVX2-LABEL: zext_mulhuw_v64i16_lshr: 1556; AVX2: # %bb.0: 1557; AVX2-NEXT: movq %rdi, %rax 1558; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0 1559; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1560; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1561; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1562; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1 1563; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1564; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1565; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1566; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2 1567; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1568; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 1569; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1570; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3 1571; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1572; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 1573; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1574; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) 1575; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi) 1576; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi) 1577; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi) 1578; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi) 1579; 
AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) 1580; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) 1581; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1582; AVX2-NEXT: vzeroupper 1583; AVX2-NEXT: retq 1584; 1585; AVX512F-LABEL: zext_mulhuw_v64i16_lshr: 1586; AVX512F: # %bb.0: 1587; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4 1588; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero 1589; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1590; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1591; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 1592; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1593; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm0 1594; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1595; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 1596; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1597; AVX512F-NEXT: vpmulhuw %ymm0, %ymm1, %ymm0 1598; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1599; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 1600; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 1601; AVX512F-NEXT: retq 1602; 1603; AVX512BW-LABEL: zext_mulhuw_v64i16_lshr: 1604; AVX512BW: # %bb.0: 1605; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm2 1606; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1607; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1608; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1609; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1 1610; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1611; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1612; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1613; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 1614; AVX512BW-NEXT: retq 1615 %a1 = zext <64 x i16> %a to <64 x i32> 1616 %b1 = zext <64 x i16> %b to <64 x i32> 1617 %c = mul <64 x i32> %a1, %b1 1618 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 
16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1619 ret <64 x i32> %d 1620} 1621 1622define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { 1623; SSE2-LABEL: mulhsw_v64i16_lshr: 1624; SSE2: # %bb.0: 1625; SSE2-NEXT: movdqa %xmm7, %xmm8 1626; SSE2-NEXT: movq %rdi, %rax 1627; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 1628; SSE2-NEXT: pxor %xmm10, %xmm10 1629; SSE2-NEXT: movdqa %xmm0, %xmm7 1630; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] 1631; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1632; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 1633; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 1634; SSE2-NEXT: movdqa %xmm1, %xmm9 1635; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 1636; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 1637; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 1638; SSE2-NEXT: movdqa %xmm2, %xmm11 1639; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 1640; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 1641; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 1642; SSE2-NEXT: movdqa %xmm3, %xmm12 1643; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] 1644; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 1645; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 1646; SSE2-NEXT: movdqa %xmm4, %xmm13 1647; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] 1648; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] 1649; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 1650; SSE2-NEXT: movdqa %xmm5, %xmm14 1651; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] 1652; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] 1653; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 1654; SSE2-NEXT: movdqa %xmm6, %xmm15 1655; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] 1656; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 1657; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 1658; SSE2-NEXT: movdqa %xmm8, %xmm7 1659; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] 1660; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 1661; SSE2-NEXT: movdqa %xmm8, 240(%rdi) 1662; SSE2-NEXT: movdqa %xmm7, 224(%rdi) 1663; SSE2-NEXT: movdqa %xmm6, 208(%rdi) 1664; SSE2-NEXT: movdqa %xmm15, 192(%rdi) 1665; SSE2-NEXT: movdqa %xmm5, 176(%rdi) 1666; SSE2-NEXT: movdqa %xmm14, 160(%rdi) 1667; SSE2-NEXT: movdqa %xmm4, 144(%rdi) 1668; SSE2-NEXT: movdqa %xmm13, 128(%rdi) 1669; SSE2-NEXT: movdqa %xmm3, 112(%rdi) 1670; SSE2-NEXT: movdqa %xmm12, 96(%rdi) 1671; 
SSE2-NEXT: movdqa %xmm2, 80(%rdi) 1672; SSE2-NEXT: movdqa %xmm11, 64(%rdi) 1673; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1674; SSE2-NEXT: movdqa %xmm9, 32(%rdi) 1675; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1676; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1677; SSE2-NEXT: movaps %xmm0, (%rdi) 1678; SSE2-NEXT: retq 1679; 1680; SSE41-LABEL: mulhsw_v64i16_lshr: 1681; SSE41: # %bb.0: 1682; SSE41-NEXT: movdqa %xmm0, %xmm8 1683; SSE41-NEXT: movq %rdi, %rax 1684; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 1685; SSE41-NEXT: pxor %xmm11, %xmm11 1686; SSE41-NEXT: movdqa %xmm8, %xmm0 1687; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 1688; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1689; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 1690; SSE41-NEXT: movdqa %xmm1, %xmm9 1691; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] 1692; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 1693; SSE41-NEXT: movdqa %xmm2, %xmm10 1694; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 1695; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 1696; SSE41-NEXT: movdqa %xmm3, %xmm12 1697; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 1698; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 1699; SSE41-NEXT: movdqa %xmm4, %xmm13 1700; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 1701; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 1702; SSE41-NEXT: movdqa %xmm5, %xmm14 1703; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] 1704; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 1705; SSE41-NEXT: movdqa %xmm6, %xmm15 1706; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] 1707; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 1708; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero 1709; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] 1710; SSE41-NEXT: movdqa %xmm7, 240(%rdi) 1711; SSE41-NEXT: movdqa %xmm0, 224(%rdi) 1712; SSE41-NEXT: movdqa %xmm15, 208(%rdi) 1713; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero 1714; SSE41-NEXT: movdqa %xmm0, 192(%rdi) 1715; SSE41-NEXT: movdqa %xmm14, 176(%rdi) 1716; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero 1717; SSE41-NEXT: movdqa %xmm0, 160(%rdi) 1718; SSE41-NEXT: movdqa %xmm13, 144(%rdi) 1719; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero 1720; SSE41-NEXT: movdqa %xmm0, 128(%rdi) 1721; SSE41-NEXT: movdqa %xmm12, 112(%rdi) 1722; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 1723; SSE41-NEXT: movdqa %xmm0, 96(%rdi) 1724; SSE41-NEXT: movdqa %xmm10, 80(%rdi) 1725; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 1726; SSE41-NEXT: movdqa %xmm0, 64(%rdi) 1727; SSE41-NEXT: movdqa %xmm9, 48(%rdi) 1728; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1729; SSE41-NEXT: movdqa %xmm0, 32(%rdi) 1730; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1731; SSE41-NEXT: movaps %xmm0, 16(%rdi) 1732; SSE41-NEXT: 
pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero 1733; SSE41-NEXT: movdqa %xmm0, (%rdi) 1734; SSE41-NEXT: retq 1735; 1736; AVX2-LABEL: mulhsw_v64i16_lshr: 1737; AVX2: # %bb.0: 1738; AVX2-NEXT: movq %rdi, %rax 1739; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0 1740; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1741; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1742; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1743; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1 1744; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1745; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1746; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1747; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2 1748; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1749; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 1750; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1751; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3 1752; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1753; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 1754; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1755; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) 1756; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi) 1757; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi) 1758; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi) 1759; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi) 1760; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) 1761; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) 1762; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1763; AVX2-NEXT: vzeroupper 1764; AVX2-NEXT: retq 1765; 1766; AVX512F-LABEL: mulhsw_v64i16_lshr: 1767; AVX512F: # %bb.0: 1768; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4 1769; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero 1770; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1771; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1772; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 1773; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1774; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm0 1775; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1776; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 1777; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1778; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 1779; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1780; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 1781; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 1782; AVX512F-NEXT: retq 1783; 1784; AVX512BW-LABEL: mulhsw_v64i16_lshr: 1785; AVX512BW: # %bb.0: 1786; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm2 1787; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1788; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1789; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1790; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1 1791; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1792; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1793; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1794; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 1795; AVX512BW-NEXT: retq 1796 %a1 = sext <64 x i16> %a to <64 x i32> 1797 %b1 = sext <64 x i16> %b to <64 x i32> 1798 %c = mul <64 x i32> %a1, %b1 1799 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1800 ret <64 x i32> %d 1801} 1802 1803define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { 1804; SSE2-LABEL: mulhsw_v64i16_ashr: 1805; SSE2: # %bb.0: 1806; SSE2-NEXT: movq %rdi, %rax 1807; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 1808; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] 1809; SSE2-NEXT: psrad $16, %xmm8 1810; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1811; SSE2-NEXT: psrad $16, %xmm0 1812; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 1813; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] 1814; SSE2-NEXT: psrad $16, %xmm9 1815; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1816; SSE2-NEXT: psrad $16, %xmm1 1817; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 1818; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] 1819; SSE2-NEXT: psrad $16, %xmm10 1820; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1821; SSE2-NEXT: psrad $16, %xmm2 1822; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 1823; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] 1824; SSE2-NEXT: psrad $16, %xmm11 1825; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1826; SSE2-NEXT: psrad $16, %xmm3 1827; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 1828; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] 1829; SSE2-NEXT: psrad $16, %xmm12 1830; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] 1831; SSE2-NEXT: psrad $16, %xmm4 1832; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 1833; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] 1834; SSE2-NEXT: psrad $16, %xmm13 1835; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1836; SSE2-NEXT: psrad $16, %xmm5 1837; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 1838; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] 1839; SSE2-NEXT: psrad $16, %xmm14 1840; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1841; SSE2-NEXT: psrad $16, %xmm6 1842; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 1843; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] 1844; SSE2-NEXT: psrad $16, %xmm15 1845; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 1846; SSE2-NEXT: psrad $16, %xmm7 1847; SSE2-NEXT: movdqa %xmm7, 240(%rdi) 1848; SSE2-NEXT: movdqa %xmm15, 224(%rdi) 1849; SSE2-NEXT: movdqa %xmm6, 208(%rdi) 1850; SSE2-NEXT: movdqa %xmm14, 192(%rdi) 1851; SSE2-NEXT: movdqa %xmm5, 176(%rdi) 1852; SSE2-NEXT: movdqa %xmm13, 160(%rdi) 1853; SSE2-NEXT: movdqa %xmm4, 144(%rdi) 1854; SSE2-NEXT: movdqa %xmm12, 128(%rdi) 1855; SSE2-NEXT: movdqa %xmm3, 112(%rdi) 1856; SSE2-NEXT: movdqa %xmm11, 96(%rdi) 1857; SSE2-NEXT: movdqa %xmm2, 80(%rdi) 1858; SSE2-NEXT: movdqa %xmm10, 64(%rdi) 1859; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1860; SSE2-NEXT: movdqa %xmm9, 32(%rdi) 1861; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1862; SSE2-NEXT: movdqa %xmm8, (%rdi) 1863; SSE2-NEXT: retq 1864; 1865; SSE41-LABEL: mulhsw_v64i16_ashr: 1866; SSE41: # %bb.0: 1867; SSE41-NEXT: movq %rdi, %rax 1868; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 1869; SSE41-NEXT: pmovsxwd %xmm0, %xmm8 1870; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1871; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 1872; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 1873; SSE41-NEXT: pmovsxwd %xmm1, %xmm9 1874; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1875; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 1876; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 1877; SSE41-NEXT: pmovsxwd %xmm2, %xmm10 1878; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 1879; SSE41-NEXT: pmovsxwd %xmm2, %xmm2 1880; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 1881; SSE41-NEXT: pmovsxwd %xmm3, %xmm11 1882; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 1883; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 1884; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 1885; SSE41-NEXT: pmovsxwd %xmm4, %xmm12 1886; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 1887; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 1888; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 1889; SSE41-NEXT: pmovsxwd %xmm5, %xmm13 1890; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 1891; SSE41-NEXT: pmovsxwd %xmm5, %xmm5 1892; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 1893; SSE41-NEXT: pmovsxwd %xmm6, %xmm14 1894; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] 1895; SSE41-NEXT: pmovsxwd %xmm6, %xmm6 1896; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 1897; SSE41-NEXT: pmovsxwd %xmm7, %xmm15 1898; SSE41-NEXT: 
pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] 1899; SSE41-NEXT: pmovsxwd %xmm7, %xmm7 1900; SSE41-NEXT: movdqa %xmm7, 240(%rdi) 1901; SSE41-NEXT: movdqa %xmm15, 224(%rdi) 1902; SSE41-NEXT: movdqa %xmm6, 208(%rdi) 1903; SSE41-NEXT: movdqa %xmm14, 192(%rdi) 1904; SSE41-NEXT: movdqa %xmm5, 176(%rdi) 1905; SSE41-NEXT: movdqa %xmm13, 160(%rdi) 1906; SSE41-NEXT: movdqa %xmm4, 144(%rdi) 1907; SSE41-NEXT: movdqa %xmm12, 128(%rdi) 1908; SSE41-NEXT: movdqa %xmm3, 112(%rdi) 1909; SSE41-NEXT: movdqa %xmm11, 96(%rdi) 1910; SSE41-NEXT: movdqa %xmm2, 80(%rdi) 1911; SSE41-NEXT: movdqa %xmm10, 64(%rdi) 1912; SSE41-NEXT: movdqa %xmm1, 48(%rdi) 1913; SSE41-NEXT: movdqa %xmm9, 32(%rdi) 1914; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 1915; SSE41-NEXT: movdqa %xmm8, (%rdi) 1916; SSE41-NEXT: retq 1917; 1918; AVX2-LABEL: mulhsw_v64i16_ashr: 1919; AVX2: # %bb.0: 1920; AVX2-NEXT: movq %rdi, %rax 1921; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0 1922; AVX2-NEXT: vpmovsxwd %xmm0, %ymm4 1923; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1924; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1925; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1 1926; AVX2-NEXT: vpmovsxwd %xmm1, %ymm5 1927; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1928; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 1929; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2 1930; AVX2-NEXT: vpmovsxwd %xmm2, %ymm6 1931; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 1932; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2 1933; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3 1934; AVX2-NEXT: vpmovsxwd %xmm3, %ymm7 1935; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 1936; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3 1937; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) 1938; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi) 1939; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi) 1940; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi) 1941; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi) 1942; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) 1943; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) 1944; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1945; AVX2-NEXT: vzeroupper 1946; AVX2-NEXT: retq 1947; 1948; AVX512F-LABEL: mulhsw_v64i16_ashr: 1949; AVX512F: # %bb.0: 1950; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4 1951; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 1952; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1953; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1954; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 1955; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm5 1956; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm0 1957; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 1958; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 1959; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1960; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 1961; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm3 1962; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 1963; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 1964; AVX512F-NEXT: retq 1965; 1966; AVX512BW-LABEL: mulhsw_v64i16_ashr: 1967; AVX512BW: # %bb.0: 1968; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm2 1969; AVX512BW-NEXT: vpmovsxwd %ymm2, %zmm0 1970; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1971; AVX512BW-NEXT: vpmovsxwd %ymm2, %zmm4 1972; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1 1973; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm2 1974; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1975; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3 1976; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 1977; AVX512BW-NEXT: retq 1978 %a1 = sext <64 x i16> %a to <64 x i32> 1979 %b1 = sext <64 x i16> %b to <64 x i32> 1980 %c = mul <64 x i32> %a1, %b1 1981 %d = ashr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 
16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1982 ret <64 x i32> %d 1983} 1984 1985define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { 1986; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64: 1987; SSE2: # %bb.0: 1988; SSE2-NEXT: movdqa %xmm0, %xmm3 1989; SSE2-NEXT: pmulhuw %xmm1, %xmm3 1990; SSE2-NEXT: pxor %xmm4, %xmm4 1991; SSE2-NEXT: movdqa %xmm3, %xmm1 1992; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 1993; SSE2-NEXT: movdqa %xmm1, %xmm0 1994; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1995; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] 1996; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1997; SSE2-NEXT: movdqa %xmm3, %xmm2 1998; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 1999; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 2000; SSE2-NEXT: retq 2001; 2002; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64: 2003; SSE41: # %bb.0: 2004; SSE41-NEXT: pmulhuw %xmm1, %xmm0 2005; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2006; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2007; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 2008; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2009; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 2010; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 2011; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2012; SSE41-NEXT: movdqa %xmm4, %xmm0 2013; SSE41-NEXT: retq 2014; 2015; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64: 2016; AVX2: # %bb.0: 2017; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 2018; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 2019; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 2020; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 2021; AVX2-NEXT: retq 2022; 2023; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64: 2024; AVX512: # %bb.0: 2025; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2026; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2027; AVX512-NEXT: retq 2028 %a1 = zext <8 x i16> %a to <8 x i64> 2029 %b1 = zext <8 x i16> %b to <8 x i64> 2030 %c = mul <8 x i64> %a1, %b1 2031 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 2032 ret <8 x i64> %d 2033} 2034 2035define <8 x i64> @sext_mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { 2036; SSE2-LABEL: sext_mulhsw_v8i16_lshr_i64: 2037; SSE2: # %bb.0: 2038; SSE2-NEXT: movdqa %xmm0, %xmm3 2039; SSE2-NEXT: pmulhw %xmm1, %xmm3 2040; SSE2-NEXT: pxor %xmm4, %xmm4 2041; SSE2-NEXT: movdqa %xmm3, %xmm1 2042; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 2043; SSE2-NEXT: movdqa %xmm1, %xmm0 2044; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2045; 
SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] 2046; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2047; SSE2-NEXT: movdqa %xmm3, %xmm2 2048; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2049; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] 2050; SSE2-NEXT: retq 2051; 2052; SSE41-LABEL: sext_mulhsw_v8i16_lshr_i64: 2053; SSE41: # %bb.0: 2054; SSE41-NEXT: pmulhw %xmm1, %xmm0 2055; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2056; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2057; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 2058; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2059; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 2060; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 2061; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2062; SSE41-NEXT: movdqa %xmm4, %xmm0 2063; SSE41-NEXT: retq 2064; 2065; AVX2-LABEL: sext_mulhsw_v8i16_lshr_i64: 2066; AVX2: # %bb.0: 2067; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 2068; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 2069; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 2070; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 2071; AVX2-NEXT: retq 2072; 2073; AVX512-LABEL: sext_mulhsw_v8i16_lshr_i64: 2074; AVX512: # %bb.0: 2075; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 2076; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2077; AVX512-NEXT: retq 2078 %a1 = sext <8 x i16> %a to <8 x i64> 2079 %b1 = sext <8 x i16> %b to <8 x i64> 2080 %c = mul <8 x i64> %a1, %b1 2081 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 2082 ret <8 x i64> %d 2083} 2084 2085define <8 x i64> @sext_mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { 2086; SSE2-LABEL: sext_mulhsw_v8i16_ashr_i64: 2087; SSE2: # %bb.0: 2088; SSE2-NEXT: pmulhw %xmm1, %xmm0 2089; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2090; SSE2-NEXT: psrad $16, %xmm1 2091; SSE2-NEXT: pxor %xmm5, %xmm5 2092; SSE2-NEXT: pxor %xmm2, %xmm2 2093; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 2094; SSE2-NEXT: movdqa %xmm1, %xmm4 2095; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 2096; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 2097; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2098; SSE2-NEXT: psrad $16, %xmm3 2099; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 2100; SSE2-NEXT: movdqa %xmm3, %xmm2 2101; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 2102; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] 2103; SSE2-NEXT: movdqa %xmm4, %xmm0 2104; SSE2-NEXT: retq 2105; 2106; SSE41-LABEL: sext_mulhsw_v8i16_ashr_i64: 2107; SSE41: # %bb.0: 2108; SSE41-NEXT: pmulhw %xmm1, %xmm0 2109; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 2110; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2111; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 2112; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2113; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 
2114; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 2115; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 2116; SSE41-NEXT: movdqa %xmm4, %xmm0 2117; SSE41-NEXT: retq 2118; 2119; AVX2-LABEL: sext_mulhsw_v8i16_ashr_i64: 2120; AVX2: # %bb.0: 2121; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 2122; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 2123; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 2124; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 2125; AVX2-NEXT: retq 2126; 2127; AVX512-LABEL: sext_mulhsw_v8i16_ashr_i64: 2128; AVX512: # %bb.0: 2129; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 2130; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 2131; AVX512-NEXT: retq 2132 %a1 = sext <8 x i16> %a to <8 x i64> 2133 %b1 = sext <8 x i16> %b to <8 x i64> 2134 %c = mul <8 x i64> %a1, %b1 2135 %d = ashr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> 2136 ret <8 x i64> %d 2137} 2138 2139define <8 x i16> @sse2_pmulh_w_const(<8 x i16> %a0, <8 x i16> %a1) { 2140; SSE-LABEL: sse2_pmulh_w_const: 2141; SSE: # %bb.0: 2142; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] 2143; SSE-NEXT: retq 2144; 2145; AVX-LABEL: sse2_pmulh_w_const: 2146; AVX: # %bb.0: 2147; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] 2148; AVX-NEXT: retq 2149 %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>) 2150 ret <8 x i16> %res 2151} 2152declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) 2153 2154define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) { 2155; SSE-LABEL: sse2_pmulhu_w_const: 2156; SSE: # %bb.0: 2157; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0] 2158; SSE-NEXT: retq 2159; 2160; AVX-LABEL: sse2_pmulhu_w_const: 2161; AVX: # %bb.0: 2162; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0] 2163; AVX-NEXT: retq 2164 %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>) 2165 ret <8 x i16> %res 2166} 2167declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) 2168 2169
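; NOTE: Editorial sketch (not autogenerated and not checked by FileCheck): every
; test above relies on the identity that the high 16 bits of a widened
; 16 x 16 -> 32 multiply equal the PMULHUW/PMULHW lane result. A minimal scalar
; illustration of the unsigned case, assuming hypothetical i16 inputs %x and %y
; (the function name and example values are for illustration only):
;
;   define i16 @mulhu_i16_scalar(i16 %x, i16 %y) {
;     %xw = zext i16 %x to i32   ; widen both operands
;     %yw = zext i16 %y to i32
;     %p  = mul i32 %xw, %yw     ; full 32-bit product
;     %hi = lshr i32 %p, 16      ; keep the upper half
;     %r  = trunc i32 %hi to i16 ; one lane of pmulhuw
;     ret i16 %r
;   }
;
; For example, %x = -3 (0xFFFD) and %y = 2 give %p = 0x0001FFFA and %r = 1,
; which is lane 2 of the constant-folded [0,0,1,2,3,4,5,0] result checked in
; sse2_pmulhu_w_const above; the signed tests use sext and PMULHW the same way.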