1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL 12; 13; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE 15 16; 17; Variable Shifts 18; 19 20define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { 21; SSE2-LABEL: var_shift_v2i32: 22; SSE2: # %bb.0: 23; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 24; SSE2-NEXT: movdqa %xmm0, %xmm3 25; SSE2-NEXT: psrad %xmm2, %xmm3 26; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 27; SSE2-NEXT: movdqa %xmm0, %xmm2 28; SSE2-NEXT: psrad %xmm4, %xmm2 29; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 30; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 31; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 32; SSE2-NEXT: movdqa %xmm0, %xmm4 33; SSE2-NEXT: psrad %xmm3, %xmm4 34; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 35; SSE2-NEXT: psrad %xmm1, %xmm0 36; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 37; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 38; SSE2-NEXT: movaps %xmm2, %xmm0 39; SSE2-NEXT: retq 40; 41; SSE41-LABEL: var_shift_v2i32: 42; SSE41: # %bb.0: 43; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 44; SSE41-NEXT: movdqa %xmm0, %xmm3 45; SSE41-NEXT: psrad %xmm2, %xmm3 46; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 47; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 48; SSE41-NEXT: movdqa %xmm0, %xmm5 49; SSE41-NEXT: psrad %xmm4, %xmm5 50; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 51; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 52; SSE41-NEXT: movdqa %xmm0, %xmm3 53; SSE41-NEXT: psrad %xmm1, %xmm3 54; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] 55; SSE41-NEXT: psrad %xmm1, %xmm0 56; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 57; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 58; SSE41-NEXT: retq 59; 60; AVX1-LABEL: var_shift_v2i32: 61; AVX1: # %bb.0: 62; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = 
xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 63; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 64; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 65; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 66; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 67; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 68; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 69; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 70; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 71; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 72; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 73; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 74; AVX1-NEXT: retq 75; 76; AVX2-LABEL: var_shift_v2i32: 77; AVX2: # %bb.0: 78; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 79; AVX2-NEXT: retq 80; 81; XOPAVX1-LABEL: var_shift_v2i32: 82; XOPAVX1: # %bb.0: 83; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 84; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 85; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 86; XOPAVX1-NEXT: retq 87; 88; XOPAVX2-LABEL: var_shift_v2i32: 89; XOPAVX2: # %bb.0: 90; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 91; XOPAVX2-NEXT: retq 92; 93; AVX512-LABEL: var_shift_v2i32: 94; AVX512: # %bb.0: 95; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 96; AVX512-NEXT: retq 97; 98; AVX512VL-LABEL: var_shift_v2i32: 99; AVX512VL: # %bb.0: 100; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 101; AVX512VL-NEXT: retq 102; 103; X86-SSE-LABEL: var_shift_v2i32: 104; X86-SSE: # %bb.0: 105; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 106; X86-SSE-NEXT: movdqa %xmm0, %xmm3 107; X86-SSE-NEXT: psrad %xmm2, %xmm3 108; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 109; X86-SSE-NEXT: movdqa %xmm0, %xmm2 110; X86-SSE-NEXT: psrad %xmm4, %xmm2 111; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 112; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 113; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 114; X86-SSE-NEXT: movdqa 
%xmm0, %xmm4 115; X86-SSE-NEXT: psrad %xmm3, %xmm4 116; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 117; X86-SSE-NEXT: psrad %xmm1, %xmm0 118; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 119; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 120; X86-SSE-NEXT: movaps %xmm2, %xmm0 121; X86-SSE-NEXT: retl 122 %shift = ashr <2 x i32> %a, %b 123 ret <2 x i32> %shift 124} 125 126define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { 127; SSE2-LABEL: var_shift_v4i16: 128; SSE2: # %bb.0: 129; SSE2-NEXT: psllw $12, %xmm1 130; SSE2-NEXT: movdqa %xmm1, %xmm2 131; SSE2-NEXT: psraw $15, %xmm2 132; SSE2-NEXT: movdqa %xmm2, %xmm3 133; SSE2-NEXT: pandn %xmm0, %xmm3 134; SSE2-NEXT: psraw $8, %xmm0 135; SSE2-NEXT: pand %xmm2, %xmm0 136; SSE2-NEXT: por %xmm3, %xmm0 137; SSE2-NEXT: paddw %xmm1, %xmm1 138; SSE2-NEXT: movdqa %xmm1, %xmm2 139; SSE2-NEXT: psraw $15, %xmm2 140; SSE2-NEXT: movdqa %xmm2, %xmm3 141; SSE2-NEXT: pandn %xmm0, %xmm3 142; SSE2-NEXT: psraw $4, %xmm0 143; SSE2-NEXT: pand %xmm2, %xmm0 144; SSE2-NEXT: por %xmm3, %xmm0 145; SSE2-NEXT: paddw %xmm1, %xmm1 146; SSE2-NEXT: movdqa %xmm1, %xmm2 147; SSE2-NEXT: psraw $15, %xmm2 148; SSE2-NEXT: movdqa %xmm2, %xmm3 149; SSE2-NEXT: pandn %xmm0, %xmm3 150; SSE2-NEXT: psraw $2, %xmm0 151; SSE2-NEXT: pand %xmm2, %xmm0 152; SSE2-NEXT: por %xmm3, %xmm0 153; SSE2-NEXT: paddw %xmm1, %xmm1 154; SSE2-NEXT: psraw $15, %xmm1 155; SSE2-NEXT: movdqa %xmm1, %xmm2 156; SSE2-NEXT: pandn %xmm0, %xmm2 157; SSE2-NEXT: psraw $1, %xmm0 158; SSE2-NEXT: pand %xmm1, %xmm0 159; SSE2-NEXT: por %xmm2, %xmm0 160; SSE2-NEXT: retq 161; 162; SSE41-LABEL: var_shift_v4i16: 163; SSE41: # %bb.0: 164; SSE41-NEXT: movdqa %xmm0, %xmm2 165; SSE41-NEXT: movdqa %xmm1, %xmm0 166; SSE41-NEXT: psllw $12, %xmm0 167; SSE41-NEXT: psllw $4, %xmm1 168; SSE41-NEXT: por %xmm1, %xmm0 169; SSE41-NEXT: movdqa %xmm0, %xmm1 170; SSE41-NEXT: paddw %xmm0, %xmm1 171; SSE41-NEXT: movdqa %xmm2, %xmm3 172; SSE41-NEXT: psraw $8, %xmm3 
173; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 174; SSE41-NEXT: movdqa %xmm2, %xmm3 175; SSE41-NEXT: psraw $4, %xmm3 176; SSE41-NEXT: movdqa %xmm1, %xmm0 177; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 178; SSE41-NEXT: movdqa %xmm2, %xmm3 179; SSE41-NEXT: psraw $2, %xmm3 180; SSE41-NEXT: paddw %xmm1, %xmm1 181; SSE41-NEXT: movdqa %xmm1, %xmm0 182; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 183; SSE41-NEXT: movdqa %xmm2, %xmm3 184; SSE41-NEXT: psraw $1, %xmm3 185; SSE41-NEXT: paddw %xmm1, %xmm1 186; SSE41-NEXT: movdqa %xmm1, %xmm0 187; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 188; SSE41-NEXT: movdqa %xmm2, %xmm0 189; SSE41-NEXT: retq 190; 191; AVX1-LABEL: var_shift_v4i16: 192; AVX1: # %bb.0: 193; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 194; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 195; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 196; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 197; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 198; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 199; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 200; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 201; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 202; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 203; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 204; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 205; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 206; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 207; AVX1-NEXT: retq 208; 209; AVX2-LABEL: var_shift_v4i16: 210; AVX2: # %bb.0: 211; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 212; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 213; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 214; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 215; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 216; AVX2-NEXT: vzeroupper 217; AVX2-NEXT: retq 218; 219; XOP-LABEL: var_shift_v4i16: 220; XOP: # %bb.0: 221; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 222; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 223; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 224; XOP-NEXT: retq 225; 226; AVX512DQ-LABEL: 
var_shift_v4i16: 227; AVX512DQ: # %bb.0: 228; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 229; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 230; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 231; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 232; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 233; AVX512DQ-NEXT: vzeroupper 234; AVX512DQ-NEXT: retq 235; 236; AVX512BW-LABEL: var_shift_v4i16: 237; AVX512BW: # %bb.0: 238; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 239; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 240; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 241; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 242; AVX512BW-NEXT: vzeroupper 243; AVX512BW-NEXT: retq 244; 245; AVX512DQVL-LABEL: var_shift_v4i16: 246; AVX512DQVL: # %bb.0: 247; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 248; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 249; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 250; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 251; AVX512DQVL-NEXT: vzeroupper 252; AVX512DQVL-NEXT: retq 253; 254; AVX512BWVL-LABEL: var_shift_v4i16: 255; AVX512BWVL: # %bb.0: 256; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 257; AVX512BWVL-NEXT: retq 258; 259; X86-SSE-LABEL: var_shift_v4i16: 260; X86-SSE: # %bb.0: 261; X86-SSE-NEXT: psllw $12, %xmm1 262; X86-SSE-NEXT: movdqa %xmm1, %xmm2 263; X86-SSE-NEXT: psraw $15, %xmm2 264; X86-SSE-NEXT: movdqa %xmm2, %xmm3 265; X86-SSE-NEXT: pandn %xmm0, %xmm3 266; X86-SSE-NEXT: psraw $8, %xmm0 267; X86-SSE-NEXT: pand %xmm2, %xmm0 268; X86-SSE-NEXT: por %xmm3, %xmm0 269; X86-SSE-NEXT: paddw %xmm1, %xmm1 270; X86-SSE-NEXT: movdqa %xmm1, %xmm2 271; X86-SSE-NEXT: psraw $15, %xmm2 272; X86-SSE-NEXT: movdqa %xmm2, %xmm3 273; X86-SSE-NEXT: pandn %xmm0, %xmm3 274; X86-SSE-NEXT: psraw $4, %xmm0 275; X86-SSE-NEXT: pand %xmm2, %xmm0 276; 
X86-SSE-NEXT: por %xmm3, %xmm0 277; X86-SSE-NEXT: paddw %xmm1, %xmm1 278; X86-SSE-NEXT: movdqa %xmm1, %xmm2 279; X86-SSE-NEXT: psraw $15, %xmm2 280; X86-SSE-NEXT: movdqa %xmm2, %xmm3 281; X86-SSE-NEXT: pandn %xmm0, %xmm3 282; X86-SSE-NEXT: psraw $2, %xmm0 283; X86-SSE-NEXT: pand %xmm2, %xmm0 284; X86-SSE-NEXT: por %xmm3, %xmm0 285; X86-SSE-NEXT: paddw %xmm1, %xmm1 286; X86-SSE-NEXT: psraw $15, %xmm1 287; X86-SSE-NEXT: movdqa %xmm1, %xmm2 288; X86-SSE-NEXT: pandn %xmm0, %xmm2 289; X86-SSE-NEXT: psraw $1, %xmm0 290; X86-SSE-NEXT: pand %xmm1, %xmm0 291; X86-SSE-NEXT: por %xmm2, %xmm0 292; X86-SSE-NEXT: retl 293 %shift = ashr <4 x i16> %a, %b 294 ret <4 x i16> %shift 295} 296 297define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { 298; SSE2-LABEL: var_shift_v2i16: 299; SSE2: # %bb.0: 300; SSE2-NEXT: psllw $12, %xmm1 301; SSE2-NEXT: movdqa %xmm1, %xmm2 302; SSE2-NEXT: psraw $15, %xmm2 303; SSE2-NEXT: movdqa %xmm2, %xmm3 304; SSE2-NEXT: pandn %xmm0, %xmm3 305; SSE2-NEXT: psraw $8, %xmm0 306; SSE2-NEXT: pand %xmm2, %xmm0 307; SSE2-NEXT: por %xmm3, %xmm0 308; SSE2-NEXT: paddw %xmm1, %xmm1 309; SSE2-NEXT: movdqa %xmm1, %xmm2 310; SSE2-NEXT: psraw $15, %xmm2 311; SSE2-NEXT: movdqa %xmm2, %xmm3 312; SSE2-NEXT: pandn %xmm0, %xmm3 313; SSE2-NEXT: psraw $4, %xmm0 314; SSE2-NEXT: pand %xmm2, %xmm0 315; SSE2-NEXT: por %xmm3, %xmm0 316; SSE2-NEXT: paddw %xmm1, %xmm1 317; SSE2-NEXT: movdqa %xmm1, %xmm2 318; SSE2-NEXT: psraw $15, %xmm2 319; SSE2-NEXT: movdqa %xmm2, %xmm3 320; SSE2-NEXT: pandn %xmm0, %xmm3 321; SSE2-NEXT: psraw $2, %xmm0 322; SSE2-NEXT: pand %xmm2, %xmm0 323; SSE2-NEXT: por %xmm3, %xmm0 324; SSE2-NEXT: paddw %xmm1, %xmm1 325; SSE2-NEXT: psraw $15, %xmm1 326; SSE2-NEXT: movdqa %xmm1, %xmm2 327; SSE2-NEXT: pandn %xmm0, %xmm2 328; SSE2-NEXT: psraw $1, %xmm0 329; SSE2-NEXT: pand %xmm1, %xmm0 330; SSE2-NEXT: por %xmm2, %xmm0 331; SSE2-NEXT: retq 332; 333; SSE41-LABEL: var_shift_v2i16: 334; SSE41: # %bb.0: 335; SSE41-NEXT: movdqa %xmm0, %xmm2 336; 
SSE41-NEXT: movdqa %xmm1, %xmm0 337; SSE41-NEXT: psllw $12, %xmm0 338; SSE41-NEXT: psllw $4, %xmm1 339; SSE41-NEXT: por %xmm1, %xmm0 340; SSE41-NEXT: movdqa %xmm0, %xmm1 341; SSE41-NEXT: paddw %xmm0, %xmm1 342; SSE41-NEXT: movdqa %xmm2, %xmm3 343; SSE41-NEXT: psraw $8, %xmm3 344; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 345; SSE41-NEXT: movdqa %xmm2, %xmm3 346; SSE41-NEXT: psraw $4, %xmm3 347; SSE41-NEXT: movdqa %xmm1, %xmm0 348; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 349; SSE41-NEXT: movdqa %xmm2, %xmm3 350; SSE41-NEXT: psraw $2, %xmm3 351; SSE41-NEXT: paddw %xmm1, %xmm1 352; SSE41-NEXT: movdqa %xmm1, %xmm0 353; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 354; SSE41-NEXT: movdqa %xmm2, %xmm3 355; SSE41-NEXT: psraw $1, %xmm3 356; SSE41-NEXT: paddw %xmm1, %xmm1 357; SSE41-NEXT: movdqa %xmm1, %xmm0 358; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 359; SSE41-NEXT: movdqa %xmm2, %xmm0 360; SSE41-NEXT: retq 361; 362; AVX1-LABEL: var_shift_v2i16: 363; AVX1: # %bb.0: 364; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 365; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 366; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 367; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 368; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 369; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 370; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 371; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 372; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 373; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 374; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 375; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 376; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 377; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 378; AVX1-NEXT: retq 379; 380; AVX2-LABEL: var_shift_v2i16: 381; AVX2: # %bb.0: 382; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 383; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 384; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 385; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 386; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 387; 
AVX2-NEXT: vzeroupper 388; AVX2-NEXT: retq 389; 390; XOP-LABEL: var_shift_v2i16: 391; XOP: # %bb.0: 392; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 393; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 394; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 395; XOP-NEXT: retq 396; 397; AVX512DQ-LABEL: var_shift_v2i16: 398; AVX512DQ: # %bb.0: 399; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 400; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 401; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 402; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 403; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 404; AVX512DQ-NEXT: vzeroupper 405; AVX512DQ-NEXT: retq 406; 407; AVX512BW-LABEL: var_shift_v2i16: 408; AVX512BW: # %bb.0: 409; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 410; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 411; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 412; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 413; AVX512BW-NEXT: vzeroupper 414; AVX512BW-NEXT: retq 415; 416; AVX512DQVL-LABEL: var_shift_v2i16: 417; AVX512DQVL: # %bb.0: 418; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 419; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 420; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 421; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 422; AVX512DQVL-NEXT: vzeroupper 423; AVX512DQVL-NEXT: retq 424; 425; AVX512BWVL-LABEL: var_shift_v2i16: 426; AVX512BWVL: # %bb.0: 427; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 428; AVX512BWVL-NEXT: retq 429; 430; X86-SSE-LABEL: var_shift_v2i16: 431; X86-SSE: # %bb.0: 432; X86-SSE-NEXT: psllw $12, %xmm1 433; X86-SSE-NEXT: movdqa %xmm1, %xmm2 434; X86-SSE-NEXT: psraw $15, %xmm2 435; X86-SSE-NEXT: movdqa %xmm2, %xmm3 436; X86-SSE-NEXT: pandn %xmm0, %xmm3 437; X86-SSE-NEXT: psraw $8, %xmm0 438; X86-SSE-NEXT: pand %xmm2, %xmm0 439; X86-SSE-NEXT: por %xmm3, 
%xmm0 440; X86-SSE-NEXT: paddw %xmm1, %xmm1 441; X86-SSE-NEXT: movdqa %xmm1, %xmm2 442; X86-SSE-NEXT: psraw $15, %xmm2 443; X86-SSE-NEXT: movdqa %xmm2, %xmm3 444; X86-SSE-NEXT: pandn %xmm0, %xmm3 445; X86-SSE-NEXT: psraw $4, %xmm0 446; X86-SSE-NEXT: pand %xmm2, %xmm0 447; X86-SSE-NEXT: por %xmm3, %xmm0 448; X86-SSE-NEXT: paddw %xmm1, %xmm1 449; X86-SSE-NEXT: movdqa %xmm1, %xmm2 450; X86-SSE-NEXT: psraw $15, %xmm2 451; X86-SSE-NEXT: movdqa %xmm2, %xmm3 452; X86-SSE-NEXT: pandn %xmm0, %xmm3 453; X86-SSE-NEXT: psraw $2, %xmm0 454; X86-SSE-NEXT: pand %xmm2, %xmm0 455; X86-SSE-NEXT: por %xmm3, %xmm0 456; X86-SSE-NEXT: paddw %xmm1, %xmm1 457; X86-SSE-NEXT: psraw $15, %xmm1 458; X86-SSE-NEXT: movdqa %xmm1, %xmm2 459; X86-SSE-NEXT: pandn %xmm0, %xmm2 460; X86-SSE-NEXT: psraw $1, %xmm0 461; X86-SSE-NEXT: pand %xmm1, %xmm0 462; X86-SSE-NEXT: por %xmm2, %xmm0 463; X86-SSE-NEXT: retl 464 %shift = ashr <2 x i16> %a, %b 465 ret <2 x i16> %shift 466} 467 468define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { 469; SSE2-LABEL: var_shift_v8i8: 470; SSE2: # %bb.0: 471; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 472; SSE2-NEXT: psllw $5, %xmm1 473; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 474; SSE2-NEXT: pxor %xmm3, %xmm3 475; SSE2-NEXT: pxor %xmm5, %xmm5 476; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 477; SSE2-NEXT: movdqa %xmm5, %xmm6 478; SSE2-NEXT: pandn %xmm2, %xmm6 479; SSE2-NEXT: psraw $4, %xmm2 480; SSE2-NEXT: pand %xmm5, %xmm2 481; SSE2-NEXT: por %xmm6, %xmm2 482; SSE2-NEXT: paddw %xmm4, %xmm4 483; SSE2-NEXT: pxor %xmm5, %xmm5 484; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 485; SSE2-NEXT: movdqa %xmm5, %xmm6 486; SSE2-NEXT: pandn %xmm2, %xmm6 487; SSE2-NEXT: psraw $2, %xmm2 488; SSE2-NEXT: pand %xmm5, %xmm2 489; 
SSE2-NEXT: por %xmm6, %xmm2 490; SSE2-NEXT: paddw %xmm4, %xmm4 491; SSE2-NEXT: pxor %xmm5, %xmm5 492; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 493; SSE2-NEXT: movdqa %xmm5, %xmm4 494; SSE2-NEXT: pandn %xmm2, %xmm4 495; SSE2-NEXT: psraw $1, %xmm2 496; SSE2-NEXT: pand %xmm5, %xmm2 497; SSE2-NEXT: por %xmm4, %xmm2 498; SSE2-NEXT: psrlw $8, %xmm2 499; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 500; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 501; SSE2-NEXT: pxor %xmm4, %xmm4 502; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 503; SSE2-NEXT: movdqa %xmm4, %xmm5 504; SSE2-NEXT: pandn %xmm0, %xmm5 505; SSE2-NEXT: psraw $4, %xmm0 506; SSE2-NEXT: pand %xmm4, %xmm0 507; SSE2-NEXT: por %xmm5, %xmm0 508; SSE2-NEXT: paddw %xmm1, %xmm1 509; SSE2-NEXT: pxor %xmm4, %xmm4 510; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 511; SSE2-NEXT: movdqa %xmm4, %xmm5 512; SSE2-NEXT: pandn %xmm0, %xmm5 513; SSE2-NEXT: psraw $2, %xmm0 514; SSE2-NEXT: pand %xmm4, %xmm0 515; SSE2-NEXT: por %xmm5, %xmm0 516; SSE2-NEXT: paddw %xmm1, %xmm1 517; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 518; SSE2-NEXT: movdqa %xmm3, %xmm1 519; SSE2-NEXT: pandn %xmm0, %xmm1 520; SSE2-NEXT: psraw $1, %xmm0 521; SSE2-NEXT: pand %xmm3, %xmm0 522; SSE2-NEXT: por %xmm1, %xmm0 523; SSE2-NEXT: psrlw $8, %xmm0 524; SSE2-NEXT: packuswb %xmm2, %xmm0 525; SSE2-NEXT: retq 526; 527; SSE41-LABEL: var_shift_v8i8: 528; SSE41: # %bb.0: 529; SSE41-NEXT: movdqa %xmm0, %xmm2 530; SSE41-NEXT: psllw $5, %xmm1 531; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 532; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 533; SSE41-NEXT: movdqa %xmm3, %xmm4 534; SSE41-NEXT: psraw $4, %xmm4 535; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 536; SSE41-NEXT: movdqa %xmm3, %xmm4 537; 
SSE41-NEXT: psraw $2, %xmm4 538; SSE41-NEXT: paddw %xmm0, %xmm0 539; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 540; SSE41-NEXT: movdqa %xmm3, %xmm4 541; SSE41-NEXT: psraw $1, %xmm4 542; SSE41-NEXT: paddw %xmm0, %xmm0 543; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 544; SSE41-NEXT: psrlw $8, %xmm3 545; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 546; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 547; SSE41-NEXT: movdqa %xmm1, %xmm2 548; SSE41-NEXT: psraw $4, %xmm2 549; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 550; SSE41-NEXT: movdqa %xmm1, %xmm2 551; SSE41-NEXT: psraw $2, %xmm2 552; SSE41-NEXT: paddw %xmm0, %xmm0 553; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 554; SSE41-NEXT: movdqa %xmm1, %xmm2 555; SSE41-NEXT: psraw $1, %xmm2 556; SSE41-NEXT: paddw %xmm0, %xmm0 557; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 558; SSE41-NEXT: psrlw $8, %xmm1 559; SSE41-NEXT: packuswb %xmm3, %xmm1 560; SSE41-NEXT: movdqa %xmm1, %xmm0 561; SSE41-NEXT: retq 562; 563; AVX-LABEL: var_shift_v8i8: 564; AVX: # %bb.0: 565; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 566; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 567; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 568; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 569; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 570; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 571; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 572; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 573; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 574; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 575; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 576; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 577; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 578; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 579; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 580; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 581; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 582; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 583; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 584; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 585; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 586; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 587; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 588; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 589; AVX-NEXT: retq 590; 591; XOP-LABEL: var_shift_v8i8: 592; XOP: # %bb.0: 593; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 594; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 595; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 596; XOP-NEXT: retq 597; 598; AVX512DQ-LABEL: var_shift_v8i8: 599; AVX512DQ: # %bb.0: 600; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 601; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 602; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 603; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 604; AVX512DQ-NEXT: vzeroupper 605; AVX512DQ-NEXT: retq 606; 607; AVX512BW-LABEL: var_shift_v8i8: 608; AVX512BW: # %bb.0: 609; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 610; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 611; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 612; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 613; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 614; AVX512BW-NEXT: vzeroupper 615; AVX512BW-NEXT: retq 616; 617; AVX512DQVL-LABEL: var_shift_v8i8: 
618; AVX512DQVL: # %bb.0: 619; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 620; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 621; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 622; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 623; AVX512DQVL-NEXT: vzeroupper 624; AVX512DQVL-NEXT: retq 625; 626; AVX512BWVL-LABEL: var_shift_v8i8: 627; AVX512BWVL: # %bb.0: 628; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 629; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 630; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 631; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 632; AVX512BWVL-NEXT: vzeroupper 633; AVX512BWVL-NEXT: retq 634; 635; X86-SSE-LABEL: var_shift_v8i8: 636; X86-SSE: # %bb.0: 637; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 638; X86-SSE-NEXT: psllw $5, %xmm1 639; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 640; X86-SSE-NEXT: pxor %xmm3, %xmm3 641; X86-SSE-NEXT: pxor %xmm5, %xmm5 642; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 643; X86-SSE-NEXT: movdqa %xmm5, %xmm6 644; X86-SSE-NEXT: pandn %xmm2, %xmm6 645; X86-SSE-NEXT: psraw $4, %xmm2 646; X86-SSE-NEXT: pand %xmm5, %xmm2 647; X86-SSE-NEXT: por %xmm6, %xmm2 648; X86-SSE-NEXT: paddw %xmm4, %xmm4 649; 
X86-SSE-NEXT: pxor %xmm5, %xmm5 650; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 651; X86-SSE-NEXT: movdqa %xmm5, %xmm6 652; X86-SSE-NEXT: pandn %xmm2, %xmm6 653; X86-SSE-NEXT: psraw $2, %xmm2 654; X86-SSE-NEXT: pand %xmm5, %xmm2 655; X86-SSE-NEXT: por %xmm6, %xmm2 656; X86-SSE-NEXT: paddw %xmm4, %xmm4 657; X86-SSE-NEXT: pxor %xmm5, %xmm5 658; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 659; X86-SSE-NEXT: movdqa %xmm5, %xmm4 660; X86-SSE-NEXT: pandn %xmm2, %xmm4 661; X86-SSE-NEXT: psraw $1, %xmm2 662; X86-SSE-NEXT: pand %xmm5, %xmm2 663; X86-SSE-NEXT: por %xmm4, %xmm2 664; X86-SSE-NEXT: psrlw $8, %xmm2 665; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 666; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 667; X86-SSE-NEXT: pxor %xmm4, %xmm4 668; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 669; X86-SSE-NEXT: movdqa %xmm4, %xmm5 670; X86-SSE-NEXT: pandn %xmm0, %xmm5 671; X86-SSE-NEXT: psraw $4, %xmm0 672; X86-SSE-NEXT: pand %xmm4, %xmm0 673; X86-SSE-NEXT: por %xmm5, %xmm0 674; X86-SSE-NEXT: paddw %xmm1, %xmm1 675; X86-SSE-NEXT: pxor %xmm4, %xmm4 676; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 677; X86-SSE-NEXT: movdqa %xmm4, %xmm5 678; X86-SSE-NEXT: pandn %xmm0, %xmm5 679; X86-SSE-NEXT: psraw $2, %xmm0 680; X86-SSE-NEXT: pand %xmm4, %xmm0 681; X86-SSE-NEXT: por %xmm5, %xmm0 682; X86-SSE-NEXT: paddw %xmm1, %xmm1 683; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 684; X86-SSE-NEXT: movdqa %xmm3, %xmm1 685; X86-SSE-NEXT: pandn %xmm0, %xmm1 686; X86-SSE-NEXT: psraw $1, %xmm0 687; X86-SSE-NEXT: pand %xmm3, %xmm0 688; X86-SSE-NEXT: por %xmm1, %xmm0 689; X86-SSE-NEXT: psrlw $8, %xmm0 690; X86-SSE-NEXT: packuswb %xmm2, %xmm0 691; X86-SSE-NEXT: retl 692 %shift = ashr <8 x i8> %a, %b 693 ret <8 x i8> %shift 694} 695 696define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { 697; SSE2-LABEL: var_shift_v4i8: 698; SSE2: # %bb.0: 699; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 700; SSE2-NEXT: psllw $5, %xmm1 701; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 702; SSE2-NEXT: pxor %xmm3, %xmm3 703; SSE2-NEXT: pxor %xmm5, %xmm5 704; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 705; SSE2-NEXT: movdqa %xmm5, %xmm6 706; SSE2-NEXT: pandn %xmm2, %xmm6 707; SSE2-NEXT: psraw $4, %xmm2 708; SSE2-NEXT: pand %xmm5, %xmm2 709; SSE2-NEXT: por %xmm6, %xmm2 710; SSE2-NEXT: paddw %xmm4, %xmm4 711; SSE2-NEXT: pxor %xmm5, %xmm5 712; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 713; SSE2-NEXT: movdqa %xmm5, %xmm6 714; SSE2-NEXT: pandn %xmm2, %xmm6 715; SSE2-NEXT: psraw $2, %xmm2 716; SSE2-NEXT: pand %xmm5, %xmm2 717; SSE2-NEXT: por %xmm6, %xmm2 718; SSE2-NEXT: paddw %xmm4, %xmm4 719; SSE2-NEXT: pxor %xmm5, %xmm5 720; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 721; SSE2-NEXT: movdqa %xmm5, %xmm4 722; SSE2-NEXT: pandn %xmm2, %xmm4 723; SSE2-NEXT: psraw $1, %xmm2 724; SSE2-NEXT: pand %xmm5, %xmm2 725; SSE2-NEXT: por %xmm4, %xmm2 726; SSE2-NEXT: psrlw $8, %xmm2 727; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 728; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 729; SSE2-NEXT: pxor %xmm4, %xmm4 730; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 731; SSE2-NEXT: movdqa %xmm4, %xmm5 732; SSE2-NEXT: pandn %xmm0, %xmm5 733; SSE2-NEXT: psraw $4, %xmm0 734; SSE2-NEXT: pand %xmm4, %xmm0 735; SSE2-NEXT: por %xmm5, %xmm0 736; SSE2-NEXT: paddw %xmm1, %xmm1 737; SSE2-NEXT: pxor %xmm4, %xmm4 738; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 739; SSE2-NEXT: movdqa %xmm4, %xmm5 740; SSE2-NEXT: pandn %xmm0, %xmm5 741; SSE2-NEXT: psraw $2, %xmm0 742; SSE2-NEXT: pand %xmm4, %xmm0 743; SSE2-NEXT: por %xmm5, %xmm0 744; SSE2-NEXT: paddw %xmm1, %xmm1 745; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 746; SSE2-NEXT: movdqa %xmm3, %xmm1 
747; SSE2-NEXT: pandn %xmm0, %xmm1 748; SSE2-NEXT: psraw $1, %xmm0 749; SSE2-NEXT: pand %xmm3, %xmm0 750; SSE2-NEXT: por %xmm1, %xmm0 751; SSE2-NEXT: psrlw $8, %xmm0 752; SSE2-NEXT: packuswb %xmm2, %xmm0 753; SSE2-NEXT: retq 754; 755; SSE41-LABEL: var_shift_v4i8: 756; SSE41: # %bb.0: 757; SSE41-NEXT: movdqa %xmm0, %xmm2 758; SSE41-NEXT: psllw $5, %xmm1 759; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 760; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 761; SSE41-NEXT: movdqa %xmm3, %xmm4 762; SSE41-NEXT: psraw $4, %xmm4 763; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 764; SSE41-NEXT: movdqa %xmm3, %xmm4 765; SSE41-NEXT: psraw $2, %xmm4 766; SSE41-NEXT: paddw %xmm0, %xmm0 767; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 768; SSE41-NEXT: movdqa %xmm3, %xmm4 769; SSE41-NEXT: psraw $1, %xmm4 770; SSE41-NEXT: paddw %xmm0, %xmm0 771; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 772; SSE41-NEXT: psrlw $8, %xmm3 773; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 774; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 775; SSE41-NEXT: movdqa %xmm1, %xmm2 776; SSE41-NEXT: psraw $4, %xmm2 777; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 778; SSE41-NEXT: movdqa %xmm1, %xmm2 779; SSE41-NEXT: psraw $2, %xmm2 780; SSE41-NEXT: paddw %xmm0, %xmm0 781; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 782; SSE41-NEXT: movdqa %xmm1, %xmm2 783; SSE41-NEXT: psraw $1, %xmm2 784; SSE41-NEXT: paddw %xmm0, %xmm0 785; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 786; SSE41-NEXT: psrlw $8, %xmm1 787; SSE41-NEXT: packuswb 
%xmm3, %xmm1 788; SSE41-NEXT: movdqa %xmm1, %xmm0 789; SSE41-NEXT: retq 790; 791; AVX-LABEL: var_shift_v4i8: 792; AVX: # %bb.0: 793; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 794; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 795; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 796; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 797; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 798; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 799; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 800; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 801; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 802; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 803; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 804; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 805; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 806; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 807; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 808; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 809; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 810; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 811; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 812; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 813; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 814; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 815; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 816; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 817; AVX-NEXT: retq 818; 819; XOP-LABEL: var_shift_v4i8: 820; XOP: # %bb.0: 821; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 822; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 823; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 824; XOP-NEXT: retq 825; 826; AVX512DQ-LABEL: var_shift_v4i8: 827; AVX512DQ: # %bb.0: 828; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 829; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 830; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 831; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 832; AVX512DQ-NEXT: vzeroupper 833; AVX512DQ-NEXT: retq 834; 835; AVX512BW-LABEL: var_shift_v4i8: 836; AVX512BW: # %bb.0: 837; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 838; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 839; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 840; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 841; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 842; AVX512BW-NEXT: vzeroupper 843; AVX512BW-NEXT: retq 844; 845; AVX512DQVL-LABEL: var_shift_v4i8: 846; AVX512DQVL: # %bb.0: 847; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 848; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 849; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 850; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 851; AVX512DQVL-NEXT: vzeroupper 852; AVX512DQVL-NEXT: retq 853; 854; AVX512BWVL-LABEL: var_shift_v4i8: 855; AVX512BWVL: # %bb.0: 856; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 857; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 858; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 859; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 860; AVX512BWVL-NEXT: vzeroupper 861; AVX512BWVL-NEXT: retq 862; 863; X86-SSE-LABEL: var_shift_v4i8: 864; X86-SSE: # %bb.0: 865; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 866; X86-SSE-NEXT: psllw $5, %xmm1 867; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 868; X86-SSE-NEXT: pxor %xmm3, %xmm3 869; X86-SSE-NEXT: pxor %xmm5, %xmm5 870; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 871; X86-SSE-NEXT: movdqa %xmm5, %xmm6 872; X86-SSE-NEXT: pandn %xmm2, %xmm6 873; X86-SSE-NEXT: psraw $4, %xmm2 874; X86-SSE-NEXT: pand %xmm5, %xmm2 875; X86-SSE-NEXT: por %xmm6, %xmm2 876; X86-SSE-NEXT: paddw %xmm4, %xmm4 877; X86-SSE-NEXT: pxor %xmm5, %xmm5 878; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 879; X86-SSE-NEXT: movdqa %xmm5, %xmm6 880; X86-SSE-NEXT: pandn %xmm2, %xmm6 881; X86-SSE-NEXT: psraw $2, %xmm2 882; X86-SSE-NEXT: pand %xmm5, %xmm2 883; X86-SSE-NEXT: por %xmm6, %xmm2 884; X86-SSE-NEXT: paddw %xmm4, %xmm4 885; X86-SSE-NEXT: pxor %xmm5, %xmm5 886; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 887; X86-SSE-NEXT: movdqa %xmm5, %xmm4 888; X86-SSE-NEXT: pandn %xmm2, %xmm4 889; X86-SSE-NEXT: psraw $1, %xmm2 890; X86-SSE-NEXT: pand %xmm5, %xmm2 891; X86-SSE-NEXT: por %xmm4, %xmm2 892; X86-SSE-NEXT: psrlw $8, %xmm2 893; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 894; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 895; X86-SSE-NEXT: 
pxor %xmm4, %xmm4 896; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 897; X86-SSE-NEXT: movdqa %xmm4, %xmm5 898; X86-SSE-NEXT: pandn %xmm0, %xmm5 899; X86-SSE-NEXT: psraw $4, %xmm0 900; X86-SSE-NEXT: pand %xmm4, %xmm0 901; X86-SSE-NEXT: por %xmm5, %xmm0 902; X86-SSE-NEXT: paddw %xmm1, %xmm1 903; X86-SSE-NEXT: pxor %xmm4, %xmm4 904; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 905; X86-SSE-NEXT: movdqa %xmm4, %xmm5 906; X86-SSE-NEXT: pandn %xmm0, %xmm5 907; X86-SSE-NEXT: psraw $2, %xmm0 908; X86-SSE-NEXT: pand %xmm4, %xmm0 909; X86-SSE-NEXT: por %xmm5, %xmm0 910; X86-SSE-NEXT: paddw %xmm1, %xmm1 911; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 912; X86-SSE-NEXT: movdqa %xmm3, %xmm1 913; X86-SSE-NEXT: pandn %xmm0, %xmm1 914; X86-SSE-NEXT: psraw $1, %xmm0 915; X86-SSE-NEXT: pand %xmm3, %xmm0 916; X86-SSE-NEXT: por %xmm1, %xmm0 917; X86-SSE-NEXT: psrlw $8, %xmm0 918; X86-SSE-NEXT: packuswb %xmm2, %xmm0 919; X86-SSE-NEXT: retl 920 %shift = ashr <4 x i8> %a, %b 921 ret <4 x i8> %shift 922} 923 924define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { 925; SSE2-LABEL: var_shift_v2i8: 926; SSE2: # %bb.0: 927; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 928; SSE2-NEXT: psllw $5, %xmm1 929; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 930; SSE2-NEXT: pxor %xmm3, %xmm3 931; SSE2-NEXT: pxor %xmm5, %xmm5 932; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 933; SSE2-NEXT: movdqa %xmm5, %xmm6 934; SSE2-NEXT: pandn %xmm2, %xmm6 935; SSE2-NEXT: psraw $4, %xmm2 936; SSE2-NEXT: pand %xmm5, %xmm2 937; SSE2-NEXT: por %xmm6, %xmm2 938; SSE2-NEXT: paddw %xmm4, %xmm4 939; SSE2-NEXT: pxor %xmm5, %xmm5 940; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 941; SSE2-NEXT: movdqa %xmm5, %xmm6 942; SSE2-NEXT: pandn %xmm2, %xmm6 943; SSE2-NEXT: psraw $2, %xmm2 944; SSE2-NEXT: pand 
%xmm5, %xmm2 945; SSE2-NEXT: por %xmm6, %xmm2 946; SSE2-NEXT: paddw %xmm4, %xmm4 947; SSE2-NEXT: pxor %xmm5, %xmm5 948; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 949; SSE2-NEXT: movdqa %xmm5, %xmm4 950; SSE2-NEXT: pandn %xmm2, %xmm4 951; SSE2-NEXT: psraw $1, %xmm2 952; SSE2-NEXT: pand %xmm5, %xmm2 953; SSE2-NEXT: por %xmm4, %xmm2 954; SSE2-NEXT: psrlw $8, %xmm2 955; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 956; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 957; SSE2-NEXT: pxor %xmm4, %xmm4 958; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 959; SSE2-NEXT: movdqa %xmm4, %xmm5 960; SSE2-NEXT: pandn %xmm0, %xmm5 961; SSE2-NEXT: psraw $4, %xmm0 962; SSE2-NEXT: pand %xmm4, %xmm0 963; SSE2-NEXT: por %xmm5, %xmm0 964; SSE2-NEXT: paddw %xmm1, %xmm1 965; SSE2-NEXT: pxor %xmm4, %xmm4 966; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 967; SSE2-NEXT: movdqa %xmm4, %xmm5 968; SSE2-NEXT: pandn %xmm0, %xmm5 969; SSE2-NEXT: psraw $2, %xmm0 970; SSE2-NEXT: pand %xmm4, %xmm0 971; SSE2-NEXT: por %xmm5, %xmm0 972; SSE2-NEXT: paddw %xmm1, %xmm1 973; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 974; SSE2-NEXT: movdqa %xmm3, %xmm1 975; SSE2-NEXT: pandn %xmm0, %xmm1 976; SSE2-NEXT: psraw $1, %xmm0 977; SSE2-NEXT: pand %xmm3, %xmm0 978; SSE2-NEXT: por %xmm1, %xmm0 979; SSE2-NEXT: psrlw $8, %xmm0 980; SSE2-NEXT: packuswb %xmm2, %xmm0 981; SSE2-NEXT: retq 982; 983; SSE41-LABEL: var_shift_v2i8: 984; SSE41: # %bb.0: 985; SSE41-NEXT: movdqa %xmm0, %xmm2 986; SSE41-NEXT: psllw $5, %xmm1 987; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 988; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 989; SSE41-NEXT: movdqa %xmm3, %xmm4 990; SSE41-NEXT: psraw $4, %xmm4 991; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 992; SSE41-NEXT: movdqa 
%xmm3, %xmm4 993; SSE41-NEXT: psraw $2, %xmm4 994; SSE41-NEXT: paddw %xmm0, %xmm0 995; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 996; SSE41-NEXT: movdqa %xmm3, %xmm4 997; SSE41-NEXT: psraw $1, %xmm4 998; SSE41-NEXT: paddw %xmm0, %xmm0 999; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 1000; SSE41-NEXT: psrlw $8, %xmm3 1001; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1002; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1003; SSE41-NEXT: movdqa %xmm1, %xmm2 1004; SSE41-NEXT: psraw $4, %xmm2 1005; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1006; SSE41-NEXT: movdqa %xmm1, %xmm2 1007; SSE41-NEXT: psraw $2, %xmm2 1008; SSE41-NEXT: paddw %xmm0, %xmm0 1009; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1010; SSE41-NEXT: movdqa %xmm1, %xmm2 1011; SSE41-NEXT: psraw $1, %xmm2 1012; SSE41-NEXT: paddw %xmm0, %xmm0 1013; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1014; SSE41-NEXT: psrlw $8, %xmm1 1015; SSE41-NEXT: packuswb %xmm3, %xmm1 1016; SSE41-NEXT: movdqa %xmm1, %xmm0 1017; SSE41-NEXT: retq 1018; 1019; AVX-LABEL: var_shift_v2i8: 1020; AVX: # %bb.0: 1021; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 1022; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1023; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1024; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 1025; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1026; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 1027; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1028; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1029; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 1030; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1031; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 1032; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 1033; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 
1034; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1035; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 1036; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1037; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 1038; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1039; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1040; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 1041; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1042; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1043; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1044; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1045; AVX-NEXT: retq 1046; 1047; XOP-LABEL: var_shift_v2i8: 1048; XOP: # %bb.0: 1049; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1050; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1051; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 1052; XOP-NEXT: retq 1053; 1054; AVX512DQ-LABEL: var_shift_v2i8: 1055; AVX512DQ: # %bb.0: 1056; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 1057; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1058; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 1059; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1060; AVX512DQ-NEXT: vzeroupper 1061; AVX512DQ-NEXT: retq 1062; 1063; AVX512BW-LABEL: var_shift_v2i8: 1064; AVX512BW: # %bb.0: 1065; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1066; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1067; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1068; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1069; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1070; AVX512BW-NEXT: 
vzeroupper 1071; AVX512BW-NEXT: retq 1072; 1073; AVX512DQVL-LABEL: var_shift_v2i8: 1074; AVX512DQVL: # %bb.0: 1075; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 1076; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1077; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 1078; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1079; AVX512DQVL-NEXT: vzeroupper 1080; AVX512DQVL-NEXT: retq 1081; 1082; AVX512BWVL-LABEL: var_shift_v2i8: 1083; AVX512BWVL: # %bb.0: 1084; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1085; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1086; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 1087; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1088; AVX512BWVL-NEXT: vzeroupper 1089; AVX512BWVL-NEXT: retq 1090; 1091; X86-SSE-LABEL: var_shift_v2i8: 1092; X86-SSE: # %bb.0: 1093; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 1094; X86-SSE-NEXT: psllw $5, %xmm1 1095; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 1096; X86-SSE-NEXT: pxor %xmm3, %xmm3 1097; X86-SSE-NEXT: pxor %xmm5, %xmm5 1098; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1099; X86-SSE-NEXT: movdqa %xmm5, %xmm6 1100; X86-SSE-NEXT: pandn %xmm2, %xmm6 1101; X86-SSE-NEXT: psraw $4, %xmm2 1102; 
X86-SSE-NEXT: pand %xmm5, %xmm2 1103; X86-SSE-NEXT: por %xmm6, %xmm2 1104; X86-SSE-NEXT: paddw %xmm4, %xmm4 1105; X86-SSE-NEXT: pxor %xmm5, %xmm5 1106; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1107; X86-SSE-NEXT: movdqa %xmm5, %xmm6 1108; X86-SSE-NEXT: pandn %xmm2, %xmm6 1109; X86-SSE-NEXT: psraw $2, %xmm2 1110; X86-SSE-NEXT: pand %xmm5, %xmm2 1111; X86-SSE-NEXT: por %xmm6, %xmm2 1112; X86-SSE-NEXT: paddw %xmm4, %xmm4 1113; X86-SSE-NEXT: pxor %xmm5, %xmm5 1114; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1115; X86-SSE-NEXT: movdqa %xmm5, %xmm4 1116; X86-SSE-NEXT: pandn %xmm2, %xmm4 1117; X86-SSE-NEXT: psraw $1, %xmm2 1118; X86-SSE-NEXT: pand %xmm5, %xmm2 1119; X86-SSE-NEXT: por %xmm4, %xmm2 1120; X86-SSE-NEXT: psrlw $8, %xmm2 1121; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1122; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1123; X86-SSE-NEXT: pxor %xmm4, %xmm4 1124; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 1125; X86-SSE-NEXT: movdqa %xmm4, %xmm5 1126; X86-SSE-NEXT: pandn %xmm0, %xmm5 1127; X86-SSE-NEXT: psraw $4, %xmm0 1128; X86-SSE-NEXT: pand %xmm4, %xmm0 1129; X86-SSE-NEXT: por %xmm5, %xmm0 1130; X86-SSE-NEXT: paddw %xmm1, %xmm1 1131; X86-SSE-NEXT: pxor %xmm4, %xmm4 1132; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 1133; X86-SSE-NEXT: movdqa %xmm4, %xmm5 1134; X86-SSE-NEXT: pandn %xmm0, %xmm5 1135; X86-SSE-NEXT: psraw $2, %xmm0 1136; X86-SSE-NEXT: pand %xmm4, %xmm0 1137; X86-SSE-NEXT: por %xmm5, %xmm0 1138; X86-SSE-NEXT: paddw %xmm1, %xmm1 1139; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 1140; X86-SSE-NEXT: movdqa %xmm3, %xmm1 1141; X86-SSE-NEXT: pandn %xmm0, %xmm1 1142; X86-SSE-NEXT: psraw $1, %xmm0 1143; X86-SSE-NEXT: pand %xmm3, %xmm0 1144; X86-SSE-NEXT: por %xmm1, %xmm0 1145; X86-SSE-NEXT: psrlw $8, %xmm0 1146; X86-SSE-NEXT: packuswb %xmm2, %xmm0 1147; X86-SSE-NEXT: retl 1148 %shift = ashr <2 x i8> %a, %b 1149 ret <2 x i8> %shift 1150} 1151 1152; 1153; Uniform Variable Shifts 1154; 1155 1156define <2 x i32> 
@splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { 1157; SSE2-LABEL: splatvar_shift_v2i32: 1158; SSE2: # %bb.0: 1159; SSE2-NEXT: xorps %xmm2, %xmm2 1160; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1161; SSE2-NEXT: psrad %xmm2, %xmm0 1162; SSE2-NEXT: retq 1163; 1164; SSE41-LABEL: splatvar_shift_v2i32: 1165; SSE41: # %bb.0: 1166; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1167; SSE41-NEXT: psrad %xmm1, %xmm0 1168; SSE41-NEXT: retq 1169; 1170; AVX-LABEL: splatvar_shift_v2i32: 1171; AVX: # %bb.0: 1172; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1173; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1174; AVX-NEXT: retq 1175; 1176; XOP-LABEL: splatvar_shift_v2i32: 1177; XOP: # %bb.0: 1178; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1179; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1180; XOP-NEXT: retq 1181; 1182; AVX512-LABEL: splatvar_shift_v2i32: 1183; AVX512: # %bb.0: 1184; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1185; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1186; AVX512-NEXT: retq 1187; 1188; AVX512VL-LABEL: splatvar_shift_v2i32: 1189; AVX512VL: # %bb.0: 1190; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 1191; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1192; AVX512VL-NEXT: retq 1193; 1194; X86-SSE-LABEL: splatvar_shift_v2i32: 1195; X86-SSE: # %bb.0: 1196; X86-SSE-NEXT: xorps %xmm2, %xmm2 1197; X86-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1198; X86-SSE-NEXT: psrad %xmm2, %xmm0 1199; X86-SSE-NEXT: retl 1200 %splat = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer 1201 %shift = ashr <2 x i32> %a, %splat 1202 ret <2 x i32> %shift 1203} 1204 1205define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { 1206; SSE2-LABEL: splatvar_shift_v4i16: 1207; SSE2: # %bb.0: 1208; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1209; SSE2-NEXT: psrldq {{.*#+}} 
xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1210; SSE2-NEXT: psraw %xmm1, %xmm0 1211; SSE2-NEXT: retq 1212; 1213; SSE41-LABEL: splatvar_shift_v4i16: 1214; SSE41: # %bb.0: 1215; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1216; SSE41-NEXT: psraw %xmm1, %xmm0 1217; SSE41-NEXT: retq 1218; 1219; AVX-LABEL: splatvar_shift_v4i16: 1220; AVX: # %bb.0: 1221; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1222; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1223; AVX-NEXT: retq 1224; 1225; XOP-LABEL: splatvar_shift_v4i16: 1226; XOP: # %bb.0: 1227; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1228; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1229; XOP-NEXT: retq 1230; 1231; AVX512-LABEL: splatvar_shift_v4i16: 1232; AVX512: # %bb.0: 1233; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1234; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1235; AVX512-NEXT: retq 1236; 1237; AVX512VL-LABEL: splatvar_shift_v4i16: 1238; AVX512VL: # %bb.0: 1239; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1240; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1241; AVX512VL-NEXT: retq 1242; 1243; X86-SSE-LABEL: splatvar_shift_v4i16: 1244; X86-SSE: # %bb.0: 1245; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1246; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1247; X86-SSE-NEXT: psraw %xmm1, %xmm0 1248; X86-SSE-NEXT: retl 1249 %splat = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer 1250 %shift = ashr <4 x i16> %a, %splat 1251 ret <4 x i16> %shift 1252} 1253 1254define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { 1255; SSE2-LABEL: splatvar_shift_v2i16: 1256; SSE2: # %bb.0: 1257; SSE2-NEXT: pslldq {{.*#+}} 
xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1258; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1259; SSE2-NEXT: psraw %xmm1, %xmm0 1260; SSE2-NEXT: retq 1261; 1262; SSE41-LABEL: splatvar_shift_v2i16: 1263; SSE41: # %bb.0: 1264; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1265; SSE41-NEXT: psraw %xmm1, %xmm0 1266; SSE41-NEXT: retq 1267; 1268; AVX-LABEL: splatvar_shift_v2i16: 1269; AVX: # %bb.0: 1270; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1271; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1272; AVX-NEXT: retq 1273; 1274; XOP-LABEL: splatvar_shift_v2i16: 1275; XOP: # %bb.0: 1276; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1277; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1278; XOP-NEXT: retq 1279; 1280; AVX512-LABEL: splatvar_shift_v2i16: 1281; AVX512: # %bb.0: 1282; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1283; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1284; AVX512-NEXT: retq 1285; 1286; AVX512VL-LABEL: splatvar_shift_v2i16: 1287; AVX512VL: # %bb.0: 1288; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1289; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1290; AVX512VL-NEXT: retq 1291; 1292; X86-SSE-LABEL: splatvar_shift_v2i16: 1293; X86-SSE: # %bb.0: 1294; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1295; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1296; X86-SSE-NEXT: psraw %xmm1, %xmm0 1297; X86-SSE-NEXT: retl 1298 %splat = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer 1299 %shift = ashr <2 x i16> %a, %splat 1300 ret <2 x i16> %shift 1301} 1302 1303define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x 
i8> %b) nounwind { 1304; SSE2-LABEL: splatvar_shift_v8i8: 1305; SSE2: # %bb.0: 1306; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1307; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1308; SSE2-NEXT: psrlw %xmm1, %xmm0 1309; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1310; SSE2-NEXT: psrlw %xmm1, %xmm2 1311; SSE2-NEXT: psrlw $8, %xmm2 1312; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1313; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1314; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1315; SSE2-NEXT: pand %xmm2, %xmm0 1316; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1317; SSE2-NEXT: psrlw %xmm1, %xmm2 1318; SSE2-NEXT: pxor %xmm2, %xmm0 1319; SSE2-NEXT: psubb %xmm2, %xmm0 1320; SSE2-NEXT: retq 1321; 1322; SSE41-LABEL: splatvar_shift_v8i8: 1323; SSE41: # %bb.0: 1324; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1325; SSE41-NEXT: psrlw %xmm1, %xmm0 1326; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1327; SSE41-NEXT: psrlw %xmm1, %xmm2 1328; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1329; SSE41-NEXT: pand %xmm2, %xmm0 1330; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1331; SSE41-NEXT: psrlw %xmm1, %xmm2 1332; SSE41-NEXT: pxor %xmm2, %xmm0 1333; SSE41-NEXT: psubb %xmm2, %xmm0 1334; SSE41-NEXT: retq 1335; 1336; AVX1-LABEL: splatvar_shift_v8i8: 1337; AVX1: # %bb.0: 1338; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1339; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1340; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1341; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1342; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1343; AVX1-NEXT: 
vpand %xmm2, %xmm0, %xmm0 1344; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1345; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1346; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1347; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1348; AVX1-NEXT: retq 1349; 1350; AVX2-LABEL: splatvar_shift_v8i8: 1351; AVX2: # %bb.0: 1352; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1353; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1354; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1355; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1356; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1357; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1358; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1359; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1360; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1361; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1362; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1363; AVX2-NEXT: retq 1364; 1365; XOPAVX1-LABEL: splatvar_shift_v8i8: 1366; XOPAVX1: # %bb.0: 1367; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1368; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1369; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1370; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1371; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 1372; XOPAVX1-NEXT: retq 1373; 1374; XOPAVX2-LABEL: splatvar_shift_v8i8: 1375; XOPAVX2: # %bb.0: 1376; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1377; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1378; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1379; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 1380; XOPAVX2-NEXT: retq 1381; 1382; AVX512DQ-LABEL: splatvar_shift_v8i8: 1383; AVX512DQ: # %bb.0: 1384; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1385; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1386; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1387; AVX512DQ-NEXT: vpmovdb 
%zmm0, %xmm0 1388; AVX512DQ-NEXT: vzeroupper 1389; AVX512DQ-NEXT: retq 1390; 1391; AVX512BW-LABEL: splatvar_shift_v8i8: 1392; AVX512BW: # %bb.0: 1393; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1394; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1395; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1396; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1397; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1398; AVX512BW-NEXT: vzeroupper 1399; AVX512BW-NEXT: retq 1400; 1401; AVX512DQVL-LABEL: splatvar_shift_v8i8: 1402; AVX512DQVL: # %bb.0: 1403; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1404; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1405; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1406; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1407; AVX512DQVL-NEXT: vzeroupper 1408; AVX512DQVL-NEXT: retq 1409; 1410; AVX512BWVL-LABEL: splatvar_shift_v8i8: 1411; AVX512BWVL: # %bb.0: 1412; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1413; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1414; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1415; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1416; AVX512BWVL-NEXT: vzeroupper 1417; AVX512BWVL-NEXT: retq 1418; 1419; X86-SSE-LABEL: splatvar_shift_v8i8: 1420; X86-SSE: # %bb.0: 1421; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1422; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1423; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1424; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1425; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1426; X86-SSE-NEXT: psrlw $8, %xmm2 1427; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1428; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 
1429; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1430; X86-SSE-NEXT: pand %xmm2, %xmm0 1431; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1432; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1433; X86-SSE-NEXT: pxor %xmm2, %xmm0 1434; X86-SSE-NEXT: psubb %xmm2, %xmm0 1435; X86-SSE-NEXT: retl 1436 %splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer 1437 %shift = ashr <8 x i8> %a, %splat 1438 ret <8 x i8> %shift 1439} 1440 1441define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { 1442; SSE2-LABEL: splatvar_shift_v4i8: 1443; SSE2: # %bb.0: 1444; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1445; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1446; SSE2-NEXT: psrlw %xmm1, %xmm0 1447; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1448; SSE2-NEXT: psrlw %xmm1, %xmm2 1449; SSE2-NEXT: psrlw $8, %xmm2 1450; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1451; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1452; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1453; SSE2-NEXT: pand %xmm2, %xmm0 1454; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1455; SSE2-NEXT: psrlw %xmm1, %xmm2 1456; SSE2-NEXT: pxor %xmm2, %xmm0 1457; SSE2-NEXT: psubb %xmm2, %xmm0 1458; SSE2-NEXT: retq 1459; 1460; SSE41-LABEL: splatvar_shift_v4i8: 1461; SSE41: # %bb.0: 1462; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1463; SSE41-NEXT: psrlw %xmm1, %xmm0 1464; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1465; SSE41-NEXT: psrlw %xmm1, %xmm2 1466; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1467; SSE41-NEXT: pand %xmm2, %xmm0 1468; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1469; 
SSE41-NEXT: psrlw %xmm1, %xmm2 1470; SSE41-NEXT: pxor %xmm2, %xmm0 1471; SSE41-NEXT: psubb %xmm2, %xmm0 1472; SSE41-NEXT: retq 1473; 1474; AVX1-LABEL: splatvar_shift_v4i8: 1475; AVX1: # %bb.0: 1476; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1477; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1478; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1479; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1480; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1481; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1482; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1483; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1484; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1485; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1486; AVX1-NEXT: retq 1487; 1488; AVX2-LABEL: splatvar_shift_v4i8: 1489; AVX2: # %bb.0: 1490; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1491; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1492; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1493; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1494; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1495; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1496; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1497; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1498; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1499; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1500; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1501; AVX2-NEXT: retq 1502; 1503; XOPAVX1-LABEL: splatvar_shift_v4i8: 1504; XOPAVX1: # %bb.0: 1505; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1506; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1507; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1508; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1509; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 1510; XOPAVX1-NEXT: retq 1511; 1512; XOPAVX2-LABEL: splatvar_shift_v4i8: 1513; XOPAVX2: 
# %bb.0: 1514; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1515; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1516; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1517; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 1518; XOPAVX2-NEXT: retq 1519; 1520; AVX512DQ-LABEL: splatvar_shift_v4i8: 1521; AVX512DQ: # %bb.0: 1522; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1523; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1524; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1525; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1526; AVX512DQ-NEXT: vzeroupper 1527; AVX512DQ-NEXT: retq 1528; 1529; AVX512BW-LABEL: splatvar_shift_v4i8: 1530; AVX512BW: # %bb.0: 1531; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1532; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1533; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1534; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1535; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1536; AVX512BW-NEXT: vzeroupper 1537; AVX512BW-NEXT: retq 1538; 1539; AVX512DQVL-LABEL: splatvar_shift_v4i8: 1540; AVX512DQVL: # %bb.0: 1541; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1542; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1543; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1544; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1545; AVX512DQVL-NEXT: vzeroupper 1546; AVX512DQVL-NEXT: retq 1547; 1548; AVX512BWVL-LABEL: splatvar_shift_v4i8: 1549; AVX512BWVL: # %bb.0: 1550; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1551; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1552; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1553; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1554; AVX512BWVL-NEXT: vzeroupper 1555; AVX512BWVL-NEXT: retq 1556; 1557; X86-SSE-LABEL: splatvar_shift_v4i8: 1558; X86-SSE: # %bb.0: 1559; 
X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1560; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1561; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1562; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1563; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1564; X86-SSE-NEXT: psrlw $8, %xmm2 1565; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1566; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1567; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1568; X86-SSE-NEXT: pand %xmm2, %xmm0 1569; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1570; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1571; X86-SSE-NEXT: pxor %xmm2, %xmm0 1572; X86-SSE-NEXT: psubb %xmm2, %xmm0 1573; X86-SSE-NEXT: retl 1574 %splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer 1575 %shift = ashr <4 x i8> %a, %splat 1576 ret <4 x i8> %shift 1577} 1578 1579define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { 1580; SSE2-LABEL: splatvar_shift_v2i8: 1581; SSE2: # %bb.0: 1582; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1583; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1584; SSE2-NEXT: psrlw %xmm1, %xmm0 1585; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1586; SSE2-NEXT: psrlw %xmm1, %xmm2 1587; SSE2-NEXT: psrlw $8, %xmm2 1588; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1589; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1590; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1591; SSE2-NEXT: pand %xmm2, %xmm0 1592; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1593; SSE2-NEXT: psrlw %xmm1, %xmm2 1594; SSE2-NEXT: pxor %xmm2, %xmm0 1595; SSE2-NEXT: psubb %xmm2, %xmm0 
1596; SSE2-NEXT: retq 1597; 1598; SSE41-LABEL: splatvar_shift_v2i8: 1599; SSE41: # %bb.0: 1600; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1601; SSE41-NEXT: psrlw %xmm1, %xmm0 1602; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1603; SSE41-NEXT: psrlw %xmm1, %xmm2 1604; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1605; SSE41-NEXT: pand %xmm2, %xmm0 1606; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1607; SSE41-NEXT: psrlw %xmm1, %xmm2 1608; SSE41-NEXT: pxor %xmm2, %xmm0 1609; SSE41-NEXT: psubb %xmm2, %xmm0 1610; SSE41-NEXT: retq 1611; 1612; AVX1-LABEL: splatvar_shift_v2i8: 1613; AVX1: # %bb.0: 1614; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1615; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1616; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1617; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1618; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1619; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1620; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1621; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1622; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1623; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1624; AVX1-NEXT: retq 1625; 1626; AVX2-LABEL: splatvar_shift_v2i8: 1627; AVX2: # %bb.0: 1628; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1629; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1630; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1631; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1632; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1633; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1634; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1635; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1636; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1637; AVX2-NEXT: 
vpxor %xmm1, %xmm0, %xmm0 1638; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1639; AVX2-NEXT: retq 1640; 1641; XOP-LABEL: splatvar_shift_v2i8: 1642; XOP: # %bb.0: 1643; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] 1644; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1645; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1646; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 1647; XOP-NEXT: retq 1648; 1649; AVX512DQ-LABEL: splatvar_shift_v2i8: 1650; AVX512DQ: # %bb.0: 1651; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1652; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1653; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1654; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1655; AVX512DQ-NEXT: vzeroupper 1656; AVX512DQ-NEXT: retq 1657; 1658; AVX512BW-LABEL: splatvar_shift_v2i8: 1659; AVX512BW: # %bb.0: 1660; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1661; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1662; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1663; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1664; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1665; AVX512BW-NEXT: vzeroupper 1666; AVX512BW-NEXT: retq 1667; 1668; AVX512DQVL-LABEL: splatvar_shift_v2i8: 1669; AVX512DQVL: # %bb.0: 1670; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1671; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1672; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1673; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1674; AVX512DQVL-NEXT: vzeroupper 1675; AVX512DQVL-NEXT: retq 1676; 1677; AVX512BWVL-LABEL: splatvar_shift_v2i8: 1678; AVX512BWVL: # %bb.0: 1679; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1680; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1681; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1682; AVX512BWVL-NEXT: 
vpmovwb %ymm0, %xmm0 1683; AVX512BWVL-NEXT: vzeroupper 1684; AVX512BWVL-NEXT: retq 1685; 1686; X86-SSE-LABEL: splatvar_shift_v2i8: 1687; X86-SSE: # %bb.0: 1688; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1689; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1690; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1691; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1692; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1693; X86-SSE-NEXT: psrlw $8, %xmm2 1694; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1695; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1696; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1697; X86-SSE-NEXT: pand %xmm2, %xmm0 1698; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1699; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1700; X86-SSE-NEXT: pxor %xmm2, %xmm0 1701; X86-SSE-NEXT: psubb %xmm2, %xmm0 1702; X86-SSE-NEXT: retl 1703 %splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer 1704 %shift = ashr <2 x i8> %a, %splat 1705 ret <2 x i8> %shift 1706} 1707 1708; 1709; Constant Shifts 1710; 1711 1712define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { 1713; SSE2-LABEL: constant_shift_v2i32: 1714; SSE2: # %bb.0: 1715; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1716; SSE2-NEXT: psrad $4, %xmm0 1717; SSE2-NEXT: psrad $5, %xmm1 1718; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1719; SSE2-NEXT: retq 1720; 1721; SSE41-LABEL: constant_shift_v2i32: 1722; SSE41: # %bb.0: 1723; SSE41-NEXT: movdqa %xmm0, %xmm1 1724; SSE41-NEXT: psrad $5, %xmm1 1725; SSE41-NEXT: psrad $4, %xmm0 1726; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1727; SSE41-NEXT: retq 1728; 1729; AVX1-LABEL: constant_shift_v2i32: 1730; AVX1: # %bb.0: 1731; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 1732; AVX1-NEXT: vpsrad $4, 
%xmm0, %xmm0 1733; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1734; AVX1-NEXT: retq 1735; 1736; AVX2-LABEL: constant_shift_v2i32: 1737; AVX2: # %bb.0: 1738; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1739; AVX2-NEXT: retq 1740; 1741; XOPAVX1-LABEL: constant_shift_v2i32: 1742; XOPAVX1: # %bb.0: 1743; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1744; XOPAVX1-NEXT: retq 1745; 1746; XOPAVX2-LABEL: constant_shift_v2i32: 1747; XOPAVX2: # %bb.0: 1748; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1749; XOPAVX2-NEXT: retq 1750; 1751; AVX512-LABEL: constant_shift_v2i32: 1752; AVX512: # %bb.0: 1753; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1754; AVX512-NEXT: retq 1755; 1756; AVX512VL-LABEL: constant_shift_v2i32: 1757; AVX512VL: # %bb.0: 1758; AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1759; AVX512VL-NEXT: retq 1760; 1761; X86-SSE-LABEL: constant_shift_v2i32: 1762; X86-SSE: # %bb.0: 1763; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1764; X86-SSE-NEXT: psrad $4, %xmm0 1765; X86-SSE-NEXT: psrad $5, %xmm1 1766; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1767; X86-SSE-NEXT: retl 1768 %shift = ashr <2 x i32> %a, <i32 4, i32 5> 1769 ret <2 x i32> %shift 1770} 1771 1772define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { 1773; SSE2-LABEL: constant_shift_v4i16: 1774; SSE2: # %bb.0: 1775; SSE2-NEXT: movdqa %xmm0, %xmm1 1776; SSE2-NEXT: psraw $2, %xmm1 1777; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 1778; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 1779; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535] 1780; SSE2-NEXT: movaps %xmm1, %xmm2 1781; SSE2-NEXT: andps %xmm0, %xmm2 1782; SSE2-NEXT: psraw $1, %xmm1 1783; SSE2-NEXT: andnps %xmm1, %xmm0 1784; SSE2-NEXT: orps %xmm2, %xmm0 1785; SSE2-NEXT: retq 1786; 1787; SSE41-LABEL: constant_shift_v4i16: 1788; 
SSE41: # %bb.0: 1789; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0] 1790; SSE41-NEXT: pmulhw %xmm0, %xmm1 1791; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1792; SSE41-NEXT: psraw $1, %xmm0 1793; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] 1794; SSE41-NEXT: retq 1795; 1796; AVX-LABEL: constant_shift_v4i16: 1797; AVX: # %bb.0: 1798; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,16384,8192,u,u,u,u] 1799; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1800; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 1801; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] 1802; AVX-NEXT: retq 1803; 1804; XOP-LABEL: constant_shift_v4i16: 1805; XOP: # %bb.0: 1806; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1807; XOP-NEXT: retq 1808; 1809; AVX512DQ-LABEL: constant_shift_v4i16: 1810; AVX512DQ: # %bb.0: 1811; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 1812; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1813; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 1814; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1815; AVX512DQ-NEXT: vzeroupper 1816; AVX512DQ-NEXT: retq 1817; 1818; AVX512BW-LABEL: constant_shift_v4i16: 1819; AVX512BW: # %bb.0: 1820; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1821; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] 1822; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1823; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1824; AVX512BW-NEXT: vzeroupper 1825; AVX512BW-NEXT: retq 1826; 1827; AVX512DQVL-LABEL: constant_shift_v4i16: 1828; AVX512DQVL: # %bb.0: 1829; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 1830; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1831; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 1832; AVX512DQVL-NEXT: vzeroupper 1833; AVX512DQVL-NEXT: retq 1834; 1835; AVX512BWVL-LABEL: constant_shift_v4i16: 1836; AVX512BWVL: # %bb.0: 1837; AVX512BWVL-NEXT: vpsravw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1838; AVX512BWVL-NEXT: retq 1839; 1840; X86-SSE-LABEL: constant_shift_v4i16: 1841; X86-SSE: # %bb.0: 1842; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1843; X86-SSE-NEXT: psraw $2, %xmm1 1844; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 1845; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 1846; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535] 1847; X86-SSE-NEXT: movaps %xmm1, %xmm2 1848; X86-SSE-NEXT: andps %xmm0, %xmm2 1849; X86-SSE-NEXT: psraw $1, %xmm1 1850; X86-SSE-NEXT: andnps %xmm1, %xmm0 1851; X86-SSE-NEXT: orps %xmm2, %xmm0 1852; X86-SSE-NEXT: retl 1853 %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3> 1854 ret <4 x i16> %shift 1855} 1856 1857define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { 1858; SSE2-LABEL: constant_shift_v2i16: 1859; SSE2: # %bb.0: 1860; SSE2-NEXT: movdqa %xmm0, %xmm1 1861; SSE2-NEXT: psraw $3, %xmm1 1862; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] 1863; SSE2-NEXT: psraw $2, %xmm0 1864; SSE2-NEXT: pand %xmm2, %xmm0 1865; SSE2-NEXT: pandn %xmm1, %xmm2 1866; SSE2-NEXT: por %xmm2, %xmm0 1867; SSE2-NEXT: retq 1868; 1869; SSE41-LABEL: constant_shift_v2i16: 1870; SSE41: # %bb.0: 1871; SSE41-NEXT: movdqa %xmm0, %xmm1 1872; SSE41-NEXT: psraw $3, %xmm1 1873; SSE41-NEXT: psraw $2, %xmm0 1874; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1875; SSE41-NEXT: retq 1876; 1877; AVX-LABEL: constant_shift_v2i16: 1878; AVX: # %bb.0: 1879; AVX-NEXT: vpsraw $3, %xmm0, %xmm1 1880; AVX-NEXT: vpsraw $2, %xmm0, %xmm0 1881; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1882; AVX-NEXT: retq 1883; 1884; XOP-LABEL: constant_shift_v2i16: 1885; XOP: # %bb.0: 1886; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1887; XOP-NEXT: retq 1888; 1889; AVX512DQ-LABEL: constant_shift_v2i16: 1890; AVX512DQ: # %bb.0: 1891; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1 1892; 
AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0 1893; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1894; AVX512DQ-NEXT: retq 1895; 1896; AVX512BW-LABEL: constant_shift_v2i16: 1897; AVX512BW: # %bb.0: 1898; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1899; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] 1900; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1901; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1902; AVX512BW-NEXT: vzeroupper 1903; AVX512BW-NEXT: retq 1904; 1905; AVX512DQVL-LABEL: constant_shift_v2i16: 1906; AVX512DQVL: # %bb.0: 1907; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1 1908; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0 1909; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] 1910; AVX512DQVL-NEXT: retq 1911; 1912; AVX512BWVL-LABEL: constant_shift_v2i16: 1913; AVX512BWVL: # %bb.0: 1914; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1915; AVX512BWVL-NEXT: retq 1916; 1917; X86-SSE-LABEL: constant_shift_v2i16: 1918; X86-SSE: # %bb.0: 1919; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1920; X86-SSE-NEXT: psraw $3, %xmm1 1921; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] 1922; X86-SSE-NEXT: psraw $2, %xmm0 1923; X86-SSE-NEXT: pand %xmm2, %xmm0 1924; X86-SSE-NEXT: pandn %xmm1, %xmm2 1925; X86-SSE-NEXT: por %xmm2, %xmm0 1926; X86-SSE-NEXT: retl 1927 %shift = ashr <2 x i16> %a, <i16 2, i16 3> 1928 ret <2 x i16> %shift 1929} 1930 1931define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { 1932; SSE-LABEL: constant_shift_v8i8: 1933; SSE: # %bb.0: 1934; SSE-NEXT: pxor %xmm1, %xmm1 1935; SSE-NEXT: movdqa %xmm0, %xmm2 1936; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1937; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1938; SSE-NEXT: psraw $8, %xmm0 1939; SSE-NEXT: pmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] 1940; SSE-NEXT: psrlw $8, %xmm0 1941; SSE-NEXT: packuswb %xmm2, %xmm0 1942; SSE-NEXT: retq 1943; 1944; AVX1-LABEL: constant_shift_v8i8: 1945; AVX1: # %bb.0: 1946; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1947; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1948; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1949; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 1950; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] 1951; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1952; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1953; AVX1-NEXT: retq 1954; 1955; AVX2-LABEL: constant_shift_v8i8: 1956; AVX2: # %bb.0: 1957; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1958; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256] 1959; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1960; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1961; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1962; AVX2-NEXT: vzeroupper 1963; AVX2-NEXT: retq 1964; 1965; XOP-LABEL: constant_shift_v8i8: 1966; XOP: # %bb.0: 1967; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1968; XOP-NEXT: retq 1969; 1970; AVX512DQ-LABEL: constant_shift_v8i8: 1971; AVX512DQ: # %bb.0: 1972; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1973; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1974; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1975; AVX512DQ-NEXT: vzeroupper 1976; AVX512DQ-NEXT: retq 1977; 1978; AVX512BW-LABEL: constant_shift_v8i8: 1979; AVX512BW: # %bb.0: 1980; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] 1981; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1982; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1983; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1984; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 
1985; AVX512BW-NEXT: vzeroupper 1986; AVX512BW-NEXT: retq 1987; 1988; AVX512DQVL-LABEL: constant_shift_v8i8: 1989; AVX512DQVL: # %bb.0: 1990; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1991; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1992; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1993; AVX512DQVL-NEXT: vzeroupper 1994; AVX512DQVL-NEXT: retq 1995; 1996; AVX512BWVL-LABEL: constant_shift_v8i8: 1997; AVX512BWVL: # %bb.0: 1998; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1999; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2000; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 2001; AVX512BWVL-NEXT: vzeroupper 2002; AVX512BWVL-NEXT: retq 2003; 2004; X86-SSE-LABEL: constant_shift_v8i8: 2005; X86-SSE: # %bb.0: 2006; X86-SSE-NEXT: pxor %xmm1, %xmm1 2007; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2008; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2009; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2010; X86-SSE-NEXT: psraw $8, %xmm0 2011; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] 2012; X86-SSE-NEXT: psrlw $8, %xmm0 2013; X86-SSE-NEXT: packuswb %xmm2, %xmm0 2014; X86-SSE-NEXT: retl 2015 %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 2016 ret <8 x i8> %shift 2017} 2018 2019define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { 2020; SSE-LABEL: constant_shift_v4i8: 2021; SSE: # %bb.0: 2022; SSE-NEXT: pxor %xmm1, %xmm1 2023; SSE-NEXT: movdqa %xmm0, %xmm2 2024; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2025; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2026; SSE-NEXT: psraw $8, %xmm0 2027; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # 
[256,128,64,32,256,256,256,256] 2028; SSE-NEXT: psrlw $8, %xmm0 2029; SSE-NEXT: packuswb %xmm2, %xmm0 2030; SSE-NEXT: retq 2031; 2032; AVX1-LABEL: constant_shift_v4i8: 2033; AVX1: # %bb.0: 2034; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2035; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2036; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2037; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 2038; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256] 2039; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2040; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2041; AVX1-NEXT: retq 2042; 2043; AVX2-LABEL: constant_shift_v4i8: 2044; AVX2: # %bb.0: 2045; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2046; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256] 2047; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 2048; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2049; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2050; AVX2-NEXT: vzeroupper 2051; AVX2-NEXT: retq 2052; 2053; XOP-LABEL: constant_shift_v4i8: 2054; XOP: # %bb.0: 2055; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2056; XOP-NEXT: retq 2057; 2058; AVX512DQ-LABEL: constant_shift_v4i8: 2059; AVX512DQ: # %bb.0: 2060; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 2061; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2062; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2063; AVX512DQ-NEXT: vzeroupper 2064; AVX512DQ-NEXT: retq 2065; 2066; AVX512BW-LABEL: constant_shift_v4i8: 2067; AVX512BW: # %bb.0: 2068; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] 2069; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 2070; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 2071; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2072; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2073; 
AVX512BW-NEXT: vzeroupper 2074; AVX512BW-NEXT: retq 2075; 2076; AVX512DQVL-LABEL: constant_shift_v4i8: 2077; AVX512DQVL: # %bb.0: 2078; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 2079; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2080; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2081; AVX512DQVL-NEXT: vzeroupper 2082; AVX512DQVL-NEXT: retq 2083; 2084; AVX512BWVL-LABEL: constant_shift_v4i8: 2085; AVX512BWVL: # %bb.0: 2086; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 2087; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2088; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 2089; AVX512BWVL-NEXT: vzeroupper 2090; AVX512BWVL-NEXT: retq 2091; 2092; X86-SSE-LABEL: constant_shift_v4i8: 2093; X86-SSE: # %bb.0: 2094; X86-SSE-NEXT: pxor %xmm1, %xmm1 2095; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2096; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2097; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2098; X86-SSE-NEXT: psraw $8, %xmm0 2099; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256] 2100; X86-SSE-NEXT: psrlw $8, %xmm0 2101; X86-SSE-NEXT: packuswb %xmm2, %xmm0 2102; X86-SSE-NEXT: retl 2103 %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3> 2104 ret <4 x i8> %shift 2105} 2106 2107define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { 2108; SSE-LABEL: constant_shift_v2i8: 2109; SSE: # %bb.0: 2110; SSE-NEXT: pxor %xmm1, %xmm1 2111; SSE-NEXT: movdqa %xmm0, %xmm2 2112; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2113; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2114; SSE-NEXT: psraw $8, %xmm0 2115; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] 2116; 
SSE-NEXT: psrlw $8, %xmm0 2117; SSE-NEXT: packuswb %xmm2, %xmm0 2118; SSE-NEXT: retq 2119; 2120; AVX1-LABEL: constant_shift_v2i8: 2121; AVX1: # %bb.0: 2122; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2123; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2124; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2125; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 2126; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256] 2127; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2128; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2129; AVX1-NEXT: retq 2130; 2131; AVX2-LABEL: constant_shift_v2i8: 2132; AVX2: # %bb.0: 2133; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2134; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256] 2135; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 2136; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2137; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2138; AVX2-NEXT: vzeroupper 2139; AVX2-NEXT: retq 2140; 2141; XOP-LABEL: constant_shift_v2i8: 2142; XOP: # %bb.0: 2143; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2144; XOP-NEXT: retq 2145; 2146; AVX512DQ-LABEL: constant_shift_v2i8: 2147; AVX512DQ: # %bb.0: 2148; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 2149; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2150; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2151; AVX512DQ-NEXT: vzeroupper 2152; AVX512DQ-NEXT: retq 2153; 2154; AVX512BW-LABEL: constant_shift_v2i8: 2155; AVX512BW: # %bb.0: 2156; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 2157; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 2158; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 2159; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2160; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2161; AVX512BW-NEXT: vzeroupper 2162; AVX512BW-NEXT: retq 
2163; 2164; AVX512DQVL-LABEL: constant_shift_v2i8: 2165; AVX512DQVL: # %bb.0: 2166; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 2167; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2168; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2169; AVX512DQVL-NEXT: vzeroupper 2170; AVX512DQVL-NEXT: retq 2171; 2172; AVX512BWVL-LABEL: constant_shift_v2i8: 2173; AVX512BWVL: # %bb.0: 2174; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 2175; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2176; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 2177; AVX512BWVL-NEXT: vzeroupper 2178; AVX512BWVL-NEXT: retq 2179; 2180; X86-SSE-LABEL: constant_shift_v2i8: 2181; X86-SSE: # %bb.0: 2182; X86-SSE-NEXT: pxor %xmm1, %xmm1 2183; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2184; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2185; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2186; X86-SSE-NEXT: psraw $8, %xmm0 2187; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256] 2188; X86-SSE-NEXT: psrlw $8, %xmm0 2189; X86-SSE-NEXT: packuswb %xmm2, %xmm0 2190; X86-SSE-NEXT: retl 2191 %shift = ashr <2 x i8> %a, <i8 2, i8 3> 2192 ret <2 x i8> %shift 2193} 2194 2195; 2196; Uniform Constant Shifts 2197; 2198 2199define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { 2200; SSE-LABEL: splatconstant_shift_v2i32: 2201; SSE: # %bb.0: 2202; SSE-NEXT: psrad $5, %xmm0 2203; SSE-NEXT: retq 2204; 2205; AVX-LABEL: splatconstant_shift_v2i32: 2206; AVX: # %bb.0: 2207; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 2208; AVX-NEXT: retq 2209; 2210; XOP-LABEL: splatconstant_shift_v2i32: 2211; XOP: # %bb.0: 2212; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 2213; XOP-NEXT: retq 2214; 2215; AVX512-LABEL: splatconstant_shift_v2i32: 2216; AVX512: # %bb.0: 2217; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 2218; AVX512-NEXT: retq 2219; 2220; 
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrad $5, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i32> %a, <i32 5, i32 5>
  ret <2 x i32> %shift
}

; ashr of <4 x i16> by a splat constant: lowers to a single psraw/vpsraw immediate.
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psraw $3, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
  ret <4 x i16> %shift
}

; ashr of <2 x i16> by a splat constant: same single-psraw lowering as the v4i16 case.
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psraw $3, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i16> %a, <i16 3, i16 3>
  ret <2 x i16> %shift
}

; ashr of <8 x i8>: no native byte arithmetic shift on x86, so the checks expect the
; psrlw+mask followed by the xor/sub sign-extension trick (or vpshab on XOP,
; vpternlogd folding xor+and on AVX512VL).
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v8i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v8i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <8 x i8> %shift
}

; ashr of <4 x i8>: expected codegen is identical to the v8i8 case (whole-xmm ops).
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v4i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
  ret <4 x i8> %shift
}

; ashr of <2 x i8>: expected codegen is identical to the v8i8/v4i8 cases.
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v2i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v2i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i8> %a, <i8 3, i8 3>
  ret <2 x i8> %shift
}