1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL 12; 13; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE 15 16; 17; Variable Shifts 18; 19 20define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 21; SSE2-LABEL: var_shift_v2i64: 22; SSE2: # %bb.0: 23; SSE2-NEXT: movdqa %xmm0, %xmm2 24; SSE2-NEXT: psrlq %xmm1, %xmm2 25; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 26; SSE2-NEXT: psrlq %xmm1, %xmm0 27; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 28; SSE2-NEXT: retq 29; 30; SSE41-LABEL: var_shift_v2i64: 31; SSE41: # %bb.0: 32; SSE41-NEXT: movdqa %xmm0, %xmm2 33; SSE41-NEXT: psrlq %xmm1, %xmm2 34; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 35; SSE41-NEXT: psrlq %xmm1, %xmm0 36; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 37; SSE41-NEXT: retq 38; 39; AVX1-LABEL: var_shift_v2i64: 40; AVX1: # %bb.0: 41; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 42; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 43; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 44; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 45; AVX1-NEXT: retq 46; 47; AVX2-LABEL: var_shift_v2i64: 48; AVX2: # %bb.0: 49; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 50; AVX2-NEXT: retq 51; 52; XOPAVX1-LABEL: var_shift_v2i64: 53; XOPAVX1: # %bb.0: 54; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 55; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 56; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 57; XOPAVX1-NEXT: retq 58; 59; XOPAVX2-LABEL: var_shift_v2i64: 60; XOPAVX2: # %bb.0: 61; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 62; XOPAVX2-NEXT: retq 63; 64; AVX512-LABEL: var_shift_v2i64: 65; AVX512: # %bb.0: 66; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 67; AVX512-NEXT: retq 68; 69; AVX512VL-LABEL: var_shift_v2i64: 70; AVX512VL: # %bb.0: 71; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 72; AVX512VL-NEXT: retq 73; 74; X86-SSE-LABEL: var_shift_v2i64: 75; X86-SSE: # %bb.0: 76; X86-SSE-NEXT: movdqa %xmm0, %xmm2 77; X86-SSE-NEXT: psrlq %xmm1, %xmm2 78; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 
79; X86-SSE-NEXT: psrlq %xmm1, %xmm0 80; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 81; X86-SSE-NEXT: retl 82 %shift = lshr <2 x i64> %a, %b 83 ret <2 x i64> %shift 84} 85 86define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 87; SSE2-LABEL: var_shift_v4i32: 88; SSE2: # %bb.0: 89; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 90; SSE2-NEXT: movdqa %xmm0, %xmm3 91; SSE2-NEXT: psrld %xmm2, %xmm3 92; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 93; SSE2-NEXT: movdqa %xmm0, %xmm2 94; SSE2-NEXT: psrld %xmm4, %xmm2 95; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 96; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 97; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 98; SSE2-NEXT: movdqa %xmm0, %xmm4 99; SSE2-NEXT: psrld %xmm3, %xmm4 100; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 101; SSE2-NEXT: psrld %xmm1, %xmm0 102; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 103; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 104; SSE2-NEXT: movaps %xmm2, %xmm0 105; SSE2-NEXT: retq 106; 107; SSE41-LABEL: var_shift_v4i32: 108; SSE41: # %bb.0: 109; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 110; SSE41-NEXT: movdqa %xmm0, %xmm3 111; SSE41-NEXT: psrld %xmm2, %xmm3 112; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 113; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 114; SSE41-NEXT: movdqa %xmm0, %xmm5 115; SSE41-NEXT: psrld %xmm4, %xmm5 116; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 117; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 118; SSE41-NEXT: movdqa %xmm0, %xmm3 119; SSE41-NEXT: psrld %xmm1, %xmm3 120; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] 121; SSE41-NEXT: psrld %xmm1, %xmm0 122; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 123; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 124; SSE41-NEXT: retq 125; 126; AVX1-LABEL: var_shift_v4i32: 127; AVX1: # 
%bb.0: 128; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 129; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 130; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 131; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 132; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 133; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 134; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 135; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 136; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 137; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 138; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 139; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 140; AVX1-NEXT: retq 141; 142; AVX2-LABEL: var_shift_v4i32: 143; AVX2: # %bb.0: 144; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 145; AVX2-NEXT: retq 146; 147; XOPAVX1-LABEL: var_shift_v4i32: 148; XOPAVX1: # %bb.0: 149; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 150; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 151; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 152; XOPAVX1-NEXT: retq 153; 154; XOPAVX2-LABEL: var_shift_v4i32: 155; XOPAVX2: # %bb.0: 156; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 157; XOPAVX2-NEXT: retq 158; 159; AVX512-LABEL: var_shift_v4i32: 160; AVX512: # %bb.0: 161; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 162; AVX512-NEXT: retq 163; 164; AVX512VL-LABEL: var_shift_v4i32: 165; AVX512VL: # %bb.0: 166; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 167; AVX512VL-NEXT: retq 168; 169; X86-SSE-LABEL: var_shift_v4i32: 170; X86-SSE: # %bb.0: 171; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 172; X86-SSE-NEXT: movdqa %xmm0, %xmm3 173; X86-SSE-NEXT: psrld %xmm2, %xmm3 174; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 175; X86-SSE-NEXT: movdqa %xmm0, %xmm2 176; X86-SSE-NEXT: psrld %xmm4, %xmm2 177; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 178; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 179; 
X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 180; X86-SSE-NEXT: movdqa %xmm0, %xmm4 181; X86-SSE-NEXT: psrld %xmm3, %xmm4 182; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 183; X86-SSE-NEXT: psrld %xmm1, %xmm0 184; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 185; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 186; X86-SSE-NEXT: movaps %xmm2, %xmm0 187; X86-SSE-NEXT: retl 188 %shift = lshr <4 x i32> %a, %b 189 ret <4 x i32> %shift 190} 191 192define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 193; SSE2-LABEL: var_shift_v8i16: 194; SSE2: # %bb.0: 195; SSE2-NEXT: psllw $12, %xmm1 196; SSE2-NEXT: movdqa %xmm1, %xmm2 197; SSE2-NEXT: psraw $15, %xmm2 198; SSE2-NEXT: movdqa %xmm2, %xmm3 199; SSE2-NEXT: pandn %xmm0, %xmm3 200; SSE2-NEXT: psrlw $8, %xmm0 201; SSE2-NEXT: pand %xmm2, %xmm0 202; SSE2-NEXT: por %xmm3, %xmm0 203; SSE2-NEXT: paddw %xmm1, %xmm1 204; SSE2-NEXT: movdqa %xmm1, %xmm2 205; SSE2-NEXT: psraw $15, %xmm2 206; SSE2-NEXT: movdqa %xmm2, %xmm3 207; SSE2-NEXT: pandn %xmm0, %xmm3 208; SSE2-NEXT: psrlw $4, %xmm0 209; SSE2-NEXT: pand %xmm2, %xmm0 210; SSE2-NEXT: por %xmm3, %xmm0 211; SSE2-NEXT: paddw %xmm1, %xmm1 212; SSE2-NEXT: movdqa %xmm1, %xmm2 213; SSE2-NEXT: psraw $15, %xmm2 214; SSE2-NEXT: movdqa %xmm2, %xmm3 215; SSE2-NEXT: pandn %xmm0, %xmm3 216; SSE2-NEXT: psrlw $2, %xmm0 217; SSE2-NEXT: pand %xmm2, %xmm0 218; SSE2-NEXT: por %xmm3, %xmm0 219; SSE2-NEXT: paddw %xmm1, %xmm1 220; SSE2-NEXT: psraw $15, %xmm1 221; SSE2-NEXT: movdqa %xmm1, %xmm2 222; SSE2-NEXT: pandn %xmm0, %xmm2 223; SSE2-NEXT: psrlw $1, %xmm0 224; SSE2-NEXT: pand %xmm1, %xmm0 225; SSE2-NEXT: por %xmm2, %xmm0 226; SSE2-NEXT: retq 227; 228; SSE41-LABEL: var_shift_v8i16: 229; SSE41: # %bb.0: 230; SSE41-NEXT: movdqa %xmm0, %xmm2 231; SSE41-NEXT: movdqa %xmm1, %xmm0 232; SSE41-NEXT: psllw $12, %xmm0 233; SSE41-NEXT: psllw $4, %xmm1 234; SSE41-NEXT: por %xmm1, %xmm0 235; SSE41-NEXT: movdqa %xmm0, %xmm1 236; SSE41-NEXT: 
paddw %xmm0, %xmm1 237; SSE41-NEXT: movdqa %xmm2, %xmm3 238; SSE41-NEXT: psrlw $8, %xmm3 239; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 240; SSE41-NEXT: movdqa %xmm2, %xmm3 241; SSE41-NEXT: psrlw $4, %xmm3 242; SSE41-NEXT: movdqa %xmm1, %xmm0 243; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 244; SSE41-NEXT: movdqa %xmm2, %xmm3 245; SSE41-NEXT: psrlw $2, %xmm3 246; SSE41-NEXT: paddw %xmm1, %xmm1 247; SSE41-NEXT: movdqa %xmm1, %xmm0 248; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 249; SSE41-NEXT: movdqa %xmm2, %xmm3 250; SSE41-NEXT: psrlw $1, %xmm3 251; SSE41-NEXT: paddw %xmm1, %xmm1 252; SSE41-NEXT: movdqa %xmm1, %xmm0 253; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 254; SSE41-NEXT: movdqa %xmm2, %xmm0 255; SSE41-NEXT: retq 256; 257; AVX1-LABEL: var_shift_v8i16: 258; AVX1: # %bb.0: 259; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 260; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 261; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 262; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 263; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 264; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 265; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 266; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 267; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 268; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 269; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 270; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 271; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 272; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 273; AVX1-NEXT: retq 274; 275; AVX2-LABEL: var_shift_v8i16: 276; AVX2: # %bb.0: 277; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 278; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 279; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 280; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 281; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 282; AVX2-NEXT: vzeroupper 283; AVX2-NEXT: retq 284; 285; XOP-LABEL: 
var_shift_v8i16: 286; XOP: # %bb.0: 287; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 288; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 289; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 290; XOP-NEXT: retq 291; 292; AVX512DQ-LABEL: var_shift_v8i16: 293; AVX512DQ: # %bb.0: 294; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 295; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 296; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 297; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 298; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 299; AVX512DQ-NEXT: vzeroupper 300; AVX512DQ-NEXT: retq 301; 302; AVX512BW-LABEL: var_shift_v8i16: 303; AVX512BW: # %bb.0: 304; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 305; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 306; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 307; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 308; AVX512BW-NEXT: vzeroupper 309; AVX512BW-NEXT: retq 310; 311; AVX512DQVL-LABEL: var_shift_v8i16: 312; AVX512DQVL: # %bb.0: 313; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 314; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 315; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 316; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 317; AVX512DQVL-NEXT: vzeroupper 318; AVX512DQVL-NEXT: retq 319; 320; AVX512BWVL-LABEL: var_shift_v8i16: 321; AVX512BWVL: # %bb.0: 322; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 323; AVX512BWVL-NEXT: retq 324; 325; X86-SSE-LABEL: var_shift_v8i16: 326; X86-SSE: # %bb.0: 327; X86-SSE-NEXT: psllw $12, %xmm1 328; X86-SSE-NEXT: movdqa %xmm1, %xmm2 329; X86-SSE-NEXT: psraw $15, %xmm2 330; X86-SSE-NEXT: movdqa %xmm2, 
%xmm3 331; X86-SSE-NEXT: pandn %xmm0, %xmm3 332; X86-SSE-NEXT: psrlw $8, %xmm0 333; X86-SSE-NEXT: pand %xmm2, %xmm0 334; X86-SSE-NEXT: por %xmm3, %xmm0 335; X86-SSE-NEXT: paddw %xmm1, %xmm1 336; X86-SSE-NEXT: movdqa %xmm1, %xmm2 337; X86-SSE-NEXT: psraw $15, %xmm2 338; X86-SSE-NEXT: movdqa %xmm2, %xmm3 339; X86-SSE-NEXT: pandn %xmm0, %xmm3 340; X86-SSE-NEXT: psrlw $4, %xmm0 341; X86-SSE-NEXT: pand %xmm2, %xmm0 342; X86-SSE-NEXT: por %xmm3, %xmm0 343; X86-SSE-NEXT: paddw %xmm1, %xmm1 344; X86-SSE-NEXT: movdqa %xmm1, %xmm2 345; X86-SSE-NEXT: psraw $15, %xmm2 346; X86-SSE-NEXT: movdqa %xmm2, %xmm3 347; X86-SSE-NEXT: pandn %xmm0, %xmm3 348; X86-SSE-NEXT: psrlw $2, %xmm0 349; X86-SSE-NEXT: pand %xmm2, %xmm0 350; X86-SSE-NEXT: por %xmm3, %xmm0 351; X86-SSE-NEXT: paddw %xmm1, %xmm1 352; X86-SSE-NEXT: psraw $15, %xmm1 353; X86-SSE-NEXT: movdqa %xmm1, %xmm2 354; X86-SSE-NEXT: pandn %xmm0, %xmm2 355; X86-SSE-NEXT: psrlw $1, %xmm0 356; X86-SSE-NEXT: pand %xmm1, %xmm0 357; X86-SSE-NEXT: por %xmm2, %xmm0 358; X86-SSE-NEXT: retl 359 %shift = lshr <8 x i16> %a, %b 360 ret <8 x i16> %shift 361} 362 363define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 364; SSE2-LABEL: var_shift_v16i8: 365; SSE2: # %bb.0: 366; SSE2-NEXT: psllw $5, %xmm1 367; SSE2-NEXT: pxor %xmm2, %xmm2 368; SSE2-NEXT: pxor %xmm3, %xmm3 369; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 370; SSE2-NEXT: movdqa %xmm3, %xmm4 371; SSE2-NEXT: pandn %xmm0, %xmm4 372; SSE2-NEXT: psrlw $4, %xmm0 373; SSE2-NEXT: pand %xmm3, %xmm0 374; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 375; SSE2-NEXT: por %xmm4, %xmm0 376; SSE2-NEXT: paddb %xmm1, %xmm1 377; SSE2-NEXT: pxor %xmm3, %xmm3 378; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 379; SSE2-NEXT: movdqa %xmm3, %xmm4 380; SSE2-NEXT: pandn %xmm0, %xmm4 381; SSE2-NEXT: psrlw $2, %xmm0 382; SSE2-NEXT: pand %xmm3, %xmm0 383; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 384; SSE2-NEXT: por %xmm4, %xmm0 385; SSE2-NEXT: paddb %xmm1, %xmm1 386; SSE2-NEXT: pcmpgtb %xmm1, 
%xmm2 387; SSE2-NEXT: movdqa %xmm2, %xmm1 388; SSE2-NEXT: pandn %xmm0, %xmm1 389; SSE2-NEXT: psrlw $1, %xmm0 390; SSE2-NEXT: pand %xmm2, %xmm0 391; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 392; SSE2-NEXT: por %xmm1, %xmm0 393; SSE2-NEXT: retq 394; 395; SSE41-LABEL: var_shift_v16i8: 396; SSE41: # %bb.0: 397; SSE41-NEXT: movdqa %xmm0, %xmm2 398; SSE41-NEXT: psllw $5, %xmm1 399; SSE41-NEXT: movdqa %xmm0, %xmm3 400; SSE41-NEXT: psrlw $4, %xmm3 401; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 402; SSE41-NEXT: movdqa %xmm1, %xmm0 403; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 404; SSE41-NEXT: movdqa %xmm2, %xmm3 405; SSE41-NEXT: psrlw $2, %xmm3 406; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 407; SSE41-NEXT: paddb %xmm1, %xmm1 408; SSE41-NEXT: movdqa %xmm1, %xmm0 409; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 410; SSE41-NEXT: movdqa %xmm2, %xmm3 411; SSE41-NEXT: psrlw $1, %xmm3 412; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 413; SSE41-NEXT: paddb %xmm1, %xmm1 414; SSE41-NEXT: movdqa %xmm1, %xmm0 415; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 416; SSE41-NEXT: movdqa %xmm2, %xmm0 417; SSE41-NEXT: retq 418; 419; AVX-LABEL: var_shift_v16i8: 420; AVX: # %bb.0: 421; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 422; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 423; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 424; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 425; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 426; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 427; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 428; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 429; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 430; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 431; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 432; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 433; AVX-NEXT: retq 434; 435; XOP-LABEL: var_shift_v16i8: 436; XOP: # %bb.0: 437; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 438; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 439; XOP-NEXT: vpshlb %xmm1, 
%xmm0, %xmm0 440; XOP-NEXT: retq 441; 442; AVX512DQ-LABEL: var_shift_v16i8: 443; AVX512DQ: # %bb.0: 444; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 445; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 446; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 447; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 448; AVX512DQ-NEXT: vzeroupper 449; AVX512DQ-NEXT: retq 450; 451; AVX512BW-LABEL: var_shift_v16i8: 452; AVX512BW: # %bb.0: 453; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 454; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 455; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 456; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 457; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 458; AVX512BW-NEXT: vzeroupper 459; AVX512BW-NEXT: retq 460; 461; AVX512DQVL-LABEL: var_shift_v16i8: 462; AVX512DQVL: # %bb.0: 463; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 464; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 465; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 466; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 467; AVX512DQVL-NEXT: vzeroupper 468; AVX512DQVL-NEXT: retq 469; 470; AVX512BWVL-LABEL: var_shift_v16i8: 471; AVX512BWVL: # %bb.0: 472; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 473; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 474; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 475; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 476; AVX512BWVL-NEXT: vzeroupper 477; AVX512BWVL-NEXT: retq 478; 479; X86-SSE-LABEL: var_shift_v16i8: 480; X86-SSE: # %bb.0: 481; X86-SSE-NEXT: psllw $5, %xmm1 482; X86-SSE-NEXT: pxor %xmm2, %xmm2 483; X86-SSE-NEXT: pxor %xmm3, %xmm3 484; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 485; X86-SSE-NEXT: movdqa %xmm3, %xmm4 486; X86-SSE-NEXT: pandn %xmm0, %xmm4 487; 
X86-SSE-NEXT: psrlw $4, %xmm0 488; X86-SSE-NEXT: pand %xmm3, %xmm0 489; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 490; X86-SSE-NEXT: por %xmm4, %xmm0 491; X86-SSE-NEXT: paddb %xmm1, %xmm1 492; X86-SSE-NEXT: pxor %xmm3, %xmm3 493; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 494; X86-SSE-NEXT: movdqa %xmm3, %xmm4 495; X86-SSE-NEXT: pandn %xmm0, %xmm4 496; X86-SSE-NEXT: psrlw $2, %xmm0 497; X86-SSE-NEXT: pand %xmm3, %xmm0 498; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 499; X86-SSE-NEXT: por %xmm4, %xmm0 500; X86-SSE-NEXT: paddb %xmm1, %xmm1 501; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 502; X86-SSE-NEXT: movdqa %xmm2, %xmm1 503; X86-SSE-NEXT: pandn %xmm0, %xmm1 504; X86-SSE-NEXT: psrlw $1, %xmm0 505; X86-SSE-NEXT: pand %xmm2, %xmm0 506; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 507; X86-SSE-NEXT: por %xmm1, %xmm0 508; X86-SSE-NEXT: retl 509 %shift = lshr <16 x i8> %a, %b 510 ret <16 x i8> %shift 511} 512 513; 514; Uniform Variable Shifts 515; 516 517define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 518; SSE-LABEL: splatvar_shift_v2i64: 519; SSE: # %bb.0: 520; SSE-NEXT: psrlq %xmm1, %xmm0 521; SSE-NEXT: retq 522; 523; AVX-LABEL: splatvar_shift_v2i64: 524; AVX: # %bb.0: 525; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 526; AVX-NEXT: retq 527; 528; XOP-LABEL: splatvar_shift_v2i64: 529; XOP: # %bb.0: 530; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 531; XOP-NEXT: retq 532; 533; AVX512-LABEL: splatvar_shift_v2i64: 534; AVX512: # %bb.0: 535; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 536; AVX512-NEXT: retq 537; 538; AVX512VL-LABEL: splatvar_shift_v2i64: 539; AVX512VL: # %bb.0: 540; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 541; AVX512VL-NEXT: retq 542; 543; X86-SSE-LABEL: splatvar_shift_v2i64: 544; X86-SSE: # %bb.0: 545; X86-SSE-NEXT: psrlq %xmm1, %xmm0 546; X86-SSE-NEXT: retl 547 %splat = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer 548 %shift = lshr <2 x i64> %a, %splat 549 ret <2 x i64> %shift 550} 551 552define <4 x i32> 
@splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 553; SSE2-LABEL: splatvar_shift_v4i32: 554; SSE2: # %bb.0: 555; SSE2-NEXT: xorps %xmm2, %xmm2 556; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 557; SSE2-NEXT: psrld %xmm2, %xmm0 558; SSE2-NEXT: retq 559; 560; SSE41-LABEL: splatvar_shift_v4i32: 561; SSE41: # %bb.0: 562; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 563; SSE41-NEXT: psrld %xmm1, %xmm0 564; SSE41-NEXT: retq 565; 566; AVX-LABEL: splatvar_shift_v4i32: 567; AVX: # %bb.0: 568; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 569; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 570; AVX-NEXT: retq 571; 572; XOP-LABEL: splatvar_shift_v4i32: 573; XOP: # %bb.0: 574; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 575; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 576; XOP-NEXT: retq 577; 578; AVX512-LABEL: splatvar_shift_v4i32: 579; AVX512: # %bb.0: 580; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 581; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 582; AVX512-NEXT: retq 583; 584; AVX512VL-LABEL: splatvar_shift_v4i32: 585; AVX512VL: # %bb.0: 586; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 587; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 588; AVX512VL-NEXT: retq 589; 590; X86-SSE-LABEL: splatvar_shift_v4i32: 591; X86-SSE: # %bb.0: 592; X86-SSE-NEXT: xorps %xmm2, %xmm2 593; X86-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 594; X86-SSE-NEXT: psrld %xmm2, %xmm0 595; X86-SSE-NEXT: retl 596 %splat = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer 597 %shift = lshr <4 x i32> %a, %splat 598 ret <4 x i32> %shift 599} 600 601define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 602; SSE2-LABEL: splatvar_shift_v8i16: 603; SSE2: # %bb.0: 604; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 605; SSE2-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 606; SSE2-NEXT: psrlw %xmm1, %xmm0 607; SSE2-NEXT: retq 608; 609; SSE41-LABEL: splatvar_shift_v8i16: 610; SSE41: # %bb.0: 611; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 612; SSE41-NEXT: psrlw %xmm1, %xmm0 613; SSE41-NEXT: retq 614; 615; AVX-LABEL: splatvar_shift_v8i16: 616; AVX: # %bb.0: 617; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 618; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 619; AVX-NEXT: retq 620; 621; XOP-LABEL: splatvar_shift_v8i16: 622; XOP: # %bb.0: 623; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 624; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 625; XOP-NEXT: retq 626; 627; AVX512-LABEL: splatvar_shift_v8i16: 628; AVX512: # %bb.0: 629; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 630; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 631; AVX512-NEXT: retq 632; 633; AVX512VL-LABEL: splatvar_shift_v8i16: 634; AVX512VL: # %bb.0: 635; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 636; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 637; AVX512VL-NEXT: retq 638; 639; X86-SSE-LABEL: splatvar_shift_v8i16: 640; X86-SSE: # %bb.0: 641; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 642; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 643; X86-SSE-NEXT: psrlw %xmm1, %xmm0 644; X86-SSE-NEXT: retl 645 %splat = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer 646 %shift = lshr <8 x i16> %a, %splat 647 ret <8 x i16> %shift 648} 649 650define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 651; SSE2-LABEL: splatvar_shift_v16i8: 652; SSE2: # %bb.0: 653; SSE2-NEXT: pslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 654; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 655; SSE2-NEXT: psrlw %xmm1, %xmm0 656; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 657; SSE2-NEXT: psrlw %xmm1, %xmm2 658; SSE2-NEXT: psrlw $8, %xmm2 659; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 660; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] 661; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 662; SSE2-NEXT: pand %xmm1, %xmm0 663; SSE2-NEXT: retq 664; 665; SSE41-LABEL: splatvar_shift_v16i8: 666; SSE41: # %bb.0: 667; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 668; SSE41-NEXT: psrlw %xmm1, %xmm0 669; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 670; SSE41-NEXT: psrlw %xmm1, %xmm2 671; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 672; SSE41-NEXT: pand %xmm2, %xmm0 673; SSE41-NEXT: retq 674; 675; AVX1-LABEL: splatvar_shift_v16i8: 676; AVX1: # %bb.0: 677; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 678; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 679; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 680; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 681; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 682; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 683; AVX1-NEXT: retq 684; 685; AVX2-LABEL: splatvar_shift_v16i8: 686; AVX2: # %bb.0: 687; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 688; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 689; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 690; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 691; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 692; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 693; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 694; AVX2-NEXT: retq 695; 696; XOPAVX1-LABEL: 
splatvar_shift_v16i8: 697; XOPAVX1: # %bb.0: 698; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 699; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 700; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 701; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 702; XOPAVX1-NEXT: retq 703; 704; XOPAVX2-LABEL: splatvar_shift_v16i8: 705; XOPAVX2: # %bb.0: 706; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 707; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 708; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 709; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 710; XOPAVX2-NEXT: retq 711; 712; AVX512DQ-LABEL: splatvar_shift_v16i8: 713; AVX512DQ: # %bb.0: 714; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 715; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 716; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 717; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 718; AVX512DQ-NEXT: vzeroupper 719; AVX512DQ-NEXT: retq 720; 721; AVX512BW-LABEL: splatvar_shift_v16i8: 722; AVX512BW: # %bb.0: 723; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 724; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 725; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 726; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 727; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 728; AVX512BW-NEXT: vzeroupper 729; AVX512BW-NEXT: retq 730; 731; AVX512DQVL-LABEL: 
splatvar_shift_v16i8: 732; AVX512DQVL: # %bb.0: 733; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 734; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 735; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 736; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 737; AVX512DQVL-NEXT: vzeroupper 738; AVX512DQVL-NEXT: retq 739; 740; AVX512BWVL-LABEL: splatvar_shift_v16i8: 741; AVX512BWVL: # %bb.0: 742; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 743; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 744; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 745; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 746; AVX512BWVL-NEXT: vzeroupper 747; AVX512BWVL-NEXT: retq 748; 749; X86-SSE-LABEL: splatvar_shift_v16i8: 750; X86-SSE: # %bb.0: 751; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 752; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 753; X86-SSE-NEXT: psrlw %xmm1, %xmm0 754; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 755; X86-SSE-NEXT: psrlw %xmm1, %xmm2 756; X86-SSE-NEXT: psrlw $8, %xmm2 757; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 758; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] 
759; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 760; X86-SSE-NEXT: pand %xmm1, %xmm0 761; X86-SSE-NEXT: retl 762 %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer 763 %shift = lshr <16 x i8> %a, %splat 764 ret <16 x i8> %shift 765} 766 767; 768; Uniform Variable Modulo Shifts 769; 770 771define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 772; SSE-LABEL: splatvar_modulo_shift_v2i64: 773; SSE: # %bb.0: 774; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 775; SSE-NEXT: psrlq %xmm1, %xmm0 776; SSE-NEXT: retq 777; 778; AVX-LABEL: splatvar_modulo_shift_v2i64: 779; AVX: # %bb.0: 780; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 781; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 782; AVX-NEXT: retq 783; 784; XOP-LABEL: splatvar_modulo_shift_v2i64: 785; XOP: # %bb.0: 786; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 787; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 788; XOP-NEXT: retq 789; 790; AVX512-LABEL: splatvar_modulo_shift_v2i64: 791; AVX512: # %bb.0: 792; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 793; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 794; AVX512-NEXT: retq 795; 796; AVX512VL-LABEL: splatvar_modulo_shift_v2i64: 797; AVX512VL: # %bb.0: 798; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 799; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 800; AVX512VL-NEXT: retq 801; 802; X86-SSE-LABEL: splatvar_modulo_shift_v2i64: 803; X86-SSE: # %bb.0: 804; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 805; X86-SSE-NEXT: psrlq %xmm1, %xmm0 806; X86-SSE-NEXT: retl 807 %mod = and <2 x i64> %b, <i64 63, i64 63> 808 %splat = shufflevector <2 x i64> %mod, <2 x i64> poison, <2 x i32> zeroinitializer 809 %shift = lshr <2 x i64> %a, %splat 810 ret <2 x i64> %shift 811} 812 813define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 814; SSE-LABEL: splatvar_modulo_shift_v4i32: 815; SSE: # %bb.0: 816; SSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 817; SSE-NEXT: psrld %xmm1, %xmm0 818; SSE-NEXT: retq 819; 820; AVX-LABEL: splatvar_modulo_shift_v4i32: 821; AVX: # %bb.0: 822; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 823; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 824; AVX-NEXT: retq 825; 826; XOP-LABEL: splatvar_modulo_shift_v4i32: 827; XOP: # %bb.0: 828; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 829; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 830; XOP-NEXT: retq 831; 832; AVX512-LABEL: splatvar_modulo_shift_v4i32: 833; AVX512: # %bb.0: 834; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 835; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 836; AVX512-NEXT: retq 837; 838; AVX512VL-LABEL: splatvar_modulo_shift_v4i32: 839; AVX512VL: # %bb.0: 840; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 841; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 842; AVX512VL-NEXT: retq 843; 844; X86-SSE-LABEL: splatvar_modulo_shift_v4i32: 845; X86-SSE: # %bb.0: 846; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 847; X86-SSE-NEXT: psrld %xmm1, %xmm0 848; X86-SSE-NEXT: retl 849 %mod = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31> 850 %splat = shufflevector <4 x i32> %mod, <4 x i32> poison, <4 x i32> zeroinitializer 851 %shift = lshr <4 x i32> %a, %splat 852 ret <4 x i32> %shift 853} 854 855define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 856; SSE-LABEL: splatvar_modulo_shift_v8i16: 857; SSE: # %bb.0: 858; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 859; SSE-NEXT: psrlw %xmm1, %xmm0 860; SSE-NEXT: retq 861; 862; AVX-LABEL: splatvar_modulo_shift_v8i16: 863; AVX: # %bb.0: 864; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 865; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 866; AVX-NEXT: retq 867; 868; XOP-LABEL: splatvar_modulo_shift_v8i16: 869; XOP: # %bb.0: 870; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 871; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 872; XOP-NEXT: retq 873; 
874; AVX512-LABEL: splatvar_modulo_shift_v8i16: 875; AVX512: # %bb.0: 876; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 877; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 878; AVX512-NEXT: retq 879; 880; AVX512VL-LABEL: splatvar_modulo_shift_v8i16: 881; AVX512VL: # %bb.0: 882; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 883; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 884; AVX512VL-NEXT: retq 885; 886; X86-SSE-LABEL: splatvar_modulo_shift_v8i16: 887; X86-SSE: # %bb.0: 888; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 889; X86-SSE-NEXT: psrlw %xmm1, %xmm0 890; X86-SSE-NEXT: retl 891 %mod = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 892 %splat = shufflevector <8 x i16> %mod, <8 x i16> poison, <8 x i32> zeroinitializer 893 %shift = lshr <8 x i16> %a, %splat 894 ret <8 x i16> %shift 895} 896 897define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 898; SSE2-LABEL: splatvar_modulo_shift_v16i8: 899; SSE2: # %bb.0: 900; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 901; SSE2-NEXT: psrlw %xmm1, %xmm0 902; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 903; SSE2-NEXT: psrlw %xmm1, %xmm2 904; SSE2-NEXT: psrlw $8, %xmm2 905; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 906; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] 907; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 908; SSE2-NEXT: pand %xmm1, %xmm0 909; SSE2-NEXT: retq 910; 911; SSE41-LABEL: splatvar_modulo_shift_v16i8: 912; SSE41: # %bb.0: 913; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 914; SSE41-NEXT: psrlw %xmm1, %xmm0 915; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 916; SSE41-NEXT: psrlw %xmm1, %xmm2 917; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 918; SSE41-NEXT: pand %xmm2, %xmm0 919; SSE41-NEXT: retq 920; 921; AVX1-LABEL: splatvar_modulo_shift_v16i8: 922; AVX1: # %bb.0: 923; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 
924; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 925; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 926; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 927; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 928; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 929; AVX1-NEXT: retq 930; 931; AVX2-LABEL: splatvar_modulo_shift_v16i8: 932; AVX2: # %bb.0: 933; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 934; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 935; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 936; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 937; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 938; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 939; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 940; AVX2-NEXT: retq 941; 942; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: 943; XOPAVX1: # %bb.0: 944; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 945; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 946; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 947; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 948; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 949; XOPAVX1-NEXT: retq 950; 951; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8: 952; XOPAVX2: # %bb.0: 953; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 954; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 955; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 956; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 957; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 958; XOPAVX2-NEXT: retq 959; 960; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8: 961; AVX512DQ: # %bb.0: 962; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 963; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 964; AVX512DQ-NEXT: vpmovzxbq 
{{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 965; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 966; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 967; AVX512DQ-NEXT: vzeroupper 968; AVX512DQ-NEXT: retq 969; 970; AVX512BW-LABEL: splatvar_modulo_shift_v16i8: 971; AVX512BW: # %bb.0: 972; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 973; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 974; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 975; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 976; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 977; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 978; AVX512BW-NEXT: vzeroupper 979; AVX512BW-NEXT: retq 980; 981; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: 982; AVX512DQVL: # %bb.0: 983; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 984; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 985; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 986; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 987; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 988; AVX512DQVL-NEXT: vzeroupper 989; AVX512DQVL-NEXT: retq 990; 991; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: 992; AVX512BWVL: # %bb.0: 993; AVX512BWVL-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 994; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 995; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 996; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 997; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 998; AVX512BWVL-NEXT: vzeroupper 999; AVX512BWVL-NEXT: retq 1000; 1001; X86-SSE-LABEL: splatvar_modulo_shift_v16i8: 1002; X86-SSE: # %bb.0: 1003; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1004; X86-SSE-NEXT: psrlw %xmm1, %xmm0 1005; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1006; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1007; X86-SSE-NEXT: psrlw $8, %xmm2 1008; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1009; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] 1010; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1011; X86-SSE-NEXT: pand %xmm1, %xmm0 1012; X86-SSE-NEXT: retl 1013 %mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> 1014 %splat = shufflevector <16 x i8> %mod, <16 x i8> poison, <16 x i32> zeroinitializer 1015 %shift = lshr <16 x i8> %a, %splat 1016 ret <16 x i8> %shift 1017} 1018 1019; 1020; Constant Shifts 1021; 1022 1023define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { 1024; SSE2-LABEL: constant_shift_v2i64: 1025; SSE2: # %bb.0: 1026; SSE2-NEXT: movdqa %xmm0, %xmm1 1027; SSE2-NEXT: psrlq $1, %xmm1 1028; SSE2-NEXT: psrlq $7, %xmm0 1029; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1030; SSE2-NEXT: retq 1031; 1032; SSE41-LABEL: constant_shift_v2i64: 1033; SSE41: # %bb.0: 1034; SSE41-NEXT: movdqa %xmm0, %xmm1 1035; SSE41-NEXT: psrlq $7, %xmm1 1036; SSE41-NEXT: psrlq $1, %xmm0 1037; SSE41-NEXT: 
pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1038; SSE41-NEXT: retq 1039; 1040; AVX1-LABEL: constant_shift_v2i64: 1041; AVX1: # %bb.0: 1042; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1043; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1044; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1045; AVX1-NEXT: retq 1046; 1047; AVX2-LABEL: constant_shift_v2i64: 1048; AVX2: # %bb.0: 1049; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1050; AVX2-NEXT: retq 1051; 1052; XOPAVX1-LABEL: constant_shift_v2i64: 1053; XOPAVX1: # %bb.0: 1054; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1055; XOPAVX1-NEXT: retq 1056; 1057; XOPAVX2-LABEL: constant_shift_v2i64: 1058; XOPAVX2: # %bb.0: 1059; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1060; XOPAVX2-NEXT: retq 1061; 1062; AVX512-LABEL: constant_shift_v2i64: 1063; AVX512: # %bb.0: 1064; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1065; AVX512-NEXT: retq 1066; 1067; AVX512VL-LABEL: constant_shift_v2i64: 1068; AVX512VL: # %bb.0: 1069; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1070; AVX512VL-NEXT: retq 1071; 1072; X86-SSE-LABEL: constant_shift_v2i64: 1073; X86-SSE: # %bb.0: 1074; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1075; X86-SSE-NEXT: psrlq $1, %xmm1 1076; X86-SSE-NEXT: psrlq $7, %xmm0 1077; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1078; X86-SSE-NEXT: retl 1079 %shift = lshr <2 x i64> %a, <i64 1, i64 7> 1080 ret <2 x i64> %shift 1081} 1082 1083define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { 1084; SSE2-LABEL: constant_shift_v4i32: 1085; SSE2: # %bb.0: 1086; SSE2-NEXT: movdqa %xmm0, %xmm1 1087; SSE2-NEXT: psrld $7, %xmm1 1088; SSE2-NEXT: movdqa %xmm0, %xmm2 1089; SSE2-NEXT: psrld $6, %xmm2 1090; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1091; SSE2-NEXT: movdqa %xmm0, %xmm1 1092; SSE2-NEXT: psrld $5, %xmm1 1093; SSE2-NEXT: psrld $4, %xmm0 1094; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] 1095; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 1096; SSE2-NEXT: retq 1097; 1098; SSE41-LABEL: constant_shift_v4i32: 1099; SSE41: # %bb.0: 1100; SSE41-NEXT: movdqa %xmm0, %xmm1 1101; SSE41-NEXT: psrld $7, %xmm1 1102; SSE41-NEXT: movdqa %xmm0, %xmm2 1103; SSE41-NEXT: psrld $5, %xmm2 1104; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1105; SSE41-NEXT: movdqa %xmm0, %xmm1 1106; SSE41-NEXT: psrld $6, %xmm1 1107; SSE41-NEXT: psrld $4, %xmm0 1108; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1109; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1110; SSE41-NEXT: retq 1111; 1112; AVX1-LABEL: constant_shift_v4i32: 1113; AVX1: # %bb.0: 1114; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 1115; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 1116; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1117; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 1118; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0 1119; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1120; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1121; AVX1-NEXT: retq 1122; 1123; AVX2-LABEL: constant_shift_v4i32: 1124; AVX2: # %bb.0: 1125; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1126; AVX2-NEXT: retq 1127; 1128; XOPAVX1-LABEL: constant_shift_v4i32: 1129; XOPAVX1: # %bb.0: 1130; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1131; XOPAVX1-NEXT: retq 1132; 1133; XOPAVX2-LABEL: constant_shift_v4i32: 1134; XOPAVX2: # %bb.0: 1135; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1136; XOPAVX2-NEXT: retq 1137; 1138; AVX512-LABEL: constant_shift_v4i32: 1139; AVX512: # %bb.0: 1140; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1141; AVX512-NEXT: retq 1142; 1143; AVX512VL-LABEL: constant_shift_v4i32: 1144; AVX512VL: # %bb.0: 1145; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1146; AVX512VL-NEXT: retq 1147; 1148; 
X86-SSE-LABEL: constant_shift_v4i32: 1149; X86-SSE: # %bb.0: 1150; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1151; X86-SSE-NEXT: psrld $7, %xmm1 1152; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1153; X86-SSE-NEXT: psrld $6, %xmm2 1154; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1155; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1156; X86-SSE-NEXT: psrld $5, %xmm1 1157; X86-SSE-NEXT: psrld $4, %xmm0 1158; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1159; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 1160; X86-SSE-NEXT: retl 1161 %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1162 ret <4 x i32> %shift 1163} 1164 1165define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind { 1166; SSE-LABEL: constant_shift_v8i16_pairs: 1167; SSE: # %bb.0: 1168; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,32768,8192,8192,16384,16384,4096,4096] 1169; SSE-NEXT: retq 1170; 1171; AVX1-LABEL: constant_shift_v8i16_pairs: 1172; AVX1: # %bb.0: 1173; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,32768,8192,8192,16384,16384,4096,4096] 1174; AVX1-NEXT: retq 1175; 1176; AVX2-LABEL: constant_shift_v8i16_pairs: 1177; AVX2: # %bb.0: 1178; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1179; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1180; AVX2-NEXT: retq 1181; 1182; XOP-LABEL: constant_shift_v8i16_pairs: 1183; XOP: # %bb.0: 1184; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1185; XOP-NEXT: retq 1186; 1187; AVX512DQ-LABEL: constant_shift_v8i16_pairs: 1188; AVX512DQ: # %bb.0: 1189; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1190; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1191; AVX512DQ-NEXT: retq 1192; 1193; AVX512BW-LABEL: constant_shift_v8i16_pairs: 1194; AVX512BW: # %bb.0: 1195; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1196; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,1,3,3,2,2,4,4] 1197; AVX512BW-NEXT: 
vpsrlvw %zmm1, %zmm0, %zmm0 1198; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1199; AVX512BW-NEXT: vzeroupper 1200; AVX512BW-NEXT: retq 1201; 1202; AVX512DQVL-LABEL: constant_shift_v8i16_pairs: 1203; AVX512DQVL: # %bb.0: 1204; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1205; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1206; AVX512DQVL-NEXT: retq 1207; 1208; AVX512BWVL-LABEL: constant_shift_v8i16_pairs: 1209; AVX512BWVL: # %bb.0: 1210; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1211; AVX512BWVL-NEXT: retq 1212; 1213; X86-SSE-LABEL: constant_shift_v8i16_pairs: 1214; X86-SSE: # %bb.0: 1215; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,32768,8192,8192,16384,16384,4096,4096] 1216; X86-SSE-NEXT: retl 1217 %shift = lshr <8 x i16> %a, <i16 1, i16 1, i16 3, i16 3, i16 2, i16 2, i16 4, i16 4> 1218 ret <8 x i16> %shift 1219} 1220 1221define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { 1222; SSE2-LABEL: constant_shift_v8i16: 1223; SSE2: # %bb.0: 1224; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] 1225; SSE2-NEXT: pandn %xmm0, %xmm1 1226; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1227; SSE2-NEXT: por %xmm1, %xmm0 1228; SSE2-NEXT: retq 1229; 1230; SSE41-LABEL: constant_shift_v8i16: 1231; SSE41: # %bb.0: 1232; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,4096,2048,1024,512] 1233; SSE41-NEXT: pmulhuw %xmm0, %xmm1 1234; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1235; SSE41-NEXT: retq 1236; 1237; AVX-LABEL: constant_shift_v8i16: 1238; AVX: # %bb.0: 1239; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512] 1240; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1241; AVX-NEXT: retq 1242; 1243; XOP-LABEL: constant_shift_v8i16: 1244; XOP: # %bb.0: 1245; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1246; 
XOP-NEXT: retq 1247; 1248; AVX512DQ-LABEL: constant_shift_v8i16: 1249; AVX512DQ: # %bb.0: 1250; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512] 1251; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1252; AVX512DQ-NEXT: retq 1253; 1254; AVX512BW-LABEL: constant_shift_v8i16: 1255; AVX512BW: # %bb.0: 1256; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1257; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1258; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1259; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1260; AVX512BW-NEXT: vzeroupper 1261; AVX512BW-NEXT: retq 1262; 1263; AVX512DQVL-LABEL: constant_shift_v8i16: 1264; AVX512DQVL: # %bb.0: 1265; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512] 1266; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1267; AVX512DQVL-NEXT: retq 1268; 1269; AVX512BWVL-LABEL: constant_shift_v8i16: 1270; AVX512BWVL: # %bb.0: 1271; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1272; AVX512BWVL-NEXT: retq 1273; 1274; X86-SSE-LABEL: constant_shift_v8i16: 1275; X86-SSE: # %bb.0: 1276; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] 1277; X86-SSE-NEXT: pandn %xmm0, %xmm1 1278; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1279; X86-SSE-NEXT: por %xmm1, %xmm0 1280; X86-SSE-NEXT: retl 1281 %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1282 ret <8 x i16> %shift 1283} 1284 1285define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { 1286; SSE-LABEL: constant_shift_v16i8_pairs: 1287; SSE: # %bb.0: 1288; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] 1289; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1290; SSE-NEXT: retq 1291; 1292; AVX-LABEL: constant_shift_v16i8_pairs: 1293; 
AVX: # %bb.0: 1294; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] 1295; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1296; AVX-NEXT: retq 1297; 1298; XOP-LABEL: constant_shift_v16i8_pairs: 1299; XOP: # %bb.0: 1300; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1301; XOP-NEXT: retq 1302; 1303; AVX512DQ-LABEL: constant_shift_v16i8_pairs: 1304; AVX512DQ: # %bb.0: 1305; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] 1306; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1307; AVX512DQ-NEXT: retq 1308; 1309; AVX512BW-LABEL: constant_shift_v16i8_pairs: 1310; AVX512BW: # %bb.0: 1311; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1312; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,2,4,6,1,2,3,4] 1313; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1314; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1315; AVX512BW-NEXT: vzeroupper 1316; AVX512BW-NEXT: retq 1317; 1318; AVX512DQVL-LABEL: constant_shift_v16i8_pairs: 1319; AVX512DQVL: # %bb.0: 1320; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] 1321; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1322; AVX512DQVL-NEXT: retq 1323; 1324; AVX512BWVL-LABEL: constant_shift_v16i8_pairs: 1325; AVX512BWVL: # %bb.0: 1326; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1327; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1328; AVX512BWVL-NEXT: retq 1329; 1330; X86-SSE-LABEL: constant_shift_v16i8_pairs: 1331; X86-SSE: # %bb.0: 1332; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] 1333; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1334; X86-SSE-NEXT: retl 1335 %shift = lshr <16 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, 
i8 2, i8 2, i8 3, i8 3, i8 4, i8 4> 1336 ret <16 x i8> %shift 1337} 1338 1339define <16 x i8> @constant_shift_v16i8_quads(<16 x i8> %a) nounwind { 1340; SSE-LABEL: constant_shift_v16i8_quads: 1341; SSE: # %bb.0: 1342; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,16384,4096,4096,32768,32768,8192,8192] 1343; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1344; SSE-NEXT: retq 1345; 1346; AVX1-LABEL: constant_shift_v16i8_quads: 1347; AVX1: # %bb.0: 1348; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16384,16384,4096,4096,32768,32768,8192,8192] 1349; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1350; AVX1-NEXT: retq 1351; 1352; AVX2-LABEL: constant_shift_v16i8_quads: 1353; AVX2: # %bb.0: 1354; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1355; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1356; AVX2-NEXT: retq 1357; 1358; XOP-LABEL: constant_shift_v16i8_quads: 1359; XOP: # %bb.0: 1360; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1361; XOP-NEXT: retq 1362; 1363; AVX512-LABEL: constant_shift_v16i8_quads: 1364; AVX512: # %bb.0: 1365; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1366; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1367; AVX512-NEXT: retq 1368; 1369; AVX512VL-LABEL: constant_shift_v16i8_quads: 1370; AVX512VL: # %bb.0: 1371; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1372; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1373; AVX512VL-NEXT: retq 1374; 1375; X86-SSE-LABEL: constant_shift_v16i8_quads: 1376; X86-SSE: # %bb.0: 1377; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16384,16384,4096,4096,32768,32768,8192,8192] 1378; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1379; X86-SSE-NEXT: retl 1380 %shift = lshr <16 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 4, i8 4, i8 4, i8 4, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3> 1381 ret <16 x i8> %shift 
1382} 1383 1384define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { 1385; SSE2-LABEL: constant_shift_v16i8: 1386; SSE2: # %bb.0: 1387; SSE2-NEXT: pxor %xmm1, %xmm1 1388; SSE2-NEXT: movdqa %xmm0, %xmm2 1389; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1390; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [2,4,8,16,32,64,128,256] 1391; SSE2-NEXT: psrlw $8, %xmm2 1392; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1393; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] 1394; SSE2-NEXT: psrlw $8, %xmm0 1395; SSE2-NEXT: packuswb %xmm2, %xmm0 1396; SSE2-NEXT: retq 1397; 1398; SSE41-LABEL: constant_shift_v16i8: 1399; SSE41: # %bb.0: 1400; SSE41-NEXT: pxor %xmm2, %xmm2 1401; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1402; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1403; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16,32,64,128,256] 1404; SSE41-NEXT: psrlw $8, %xmm0 1405; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2] 1406; SSE41-NEXT: psrlw $8, %xmm1 1407; SSE41-NEXT: packuswb %xmm0, %xmm1 1408; SSE41-NEXT: movdqa %xmm1, %xmm0 1409; SSE41-NEXT: retq 1410; 1411; AVX1-LABEL: constant_shift_v16i8: 1412; AVX1: # %bb.0: 1413; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1414; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1415; AVX1-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] 1416; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1417; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1418; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] 1419; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1420; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1421; AVX1-NEXT: retq 1422; 1423; AVX2-LABEL: constant_shift_v16i8: 1424; AVX2: # %bb.0: 1425; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1426; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,2,4,8,16,32,64,128,256] 1427; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1428; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1429; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1430; AVX2-NEXT: vzeroupper 1431; AVX2-NEXT: retq 1432; 1433; XOP-LABEL: constant_shift_v16i8: 1434; XOP: # %bb.0: 1435; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1436; XOP-NEXT: retq 1437; 1438; AVX512DQ-LABEL: constant_shift_v16i8: 1439; AVX512DQ: # %bb.0: 1440; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1441; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1442; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1443; AVX512DQ-NEXT: vzeroupper 1444; AVX512DQ-NEXT: retq 1445; 1446; AVX512BW-LABEL: constant_shift_v16i8: 1447; AVX512BW: # %bb.0: 
1448; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1449; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1450; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1451; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1452; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1453; AVX512BW-NEXT: vzeroupper 1454; AVX512BW-NEXT: retq 1455; 1456; AVX512DQVL-LABEL: constant_shift_v16i8: 1457; AVX512DQVL: # %bb.0: 1458; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1459; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1460; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1461; AVX512DQVL-NEXT: vzeroupper 1462; AVX512DQVL-NEXT: retq 1463; 1464; AVX512BWVL-LABEL: constant_shift_v16i8: 1465; AVX512BWVL: # %bb.0: 1466; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1467; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1468; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1469; AVX512BWVL-NEXT: vzeroupper 1470; AVX512BWVL-NEXT: retq 1471; 1472; X86-SSE-LABEL: constant_shift_v16i8: 1473; X86-SSE: # %bb.0: 1474; X86-SSE-NEXT: pxor %xmm1, %xmm1 1475; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1476; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [2,4,8,16,32,64,128,256]
; X86-SSE-NEXT:    psrlw $8, %xmm2
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-SSE-NEXT:    psrlw $8, %xmm0
; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;

; lshr of <2 x i64> by a uniform constant: every target lowers to a single
; immediate psrlq/vpsrlq.
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlq $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlq $7, %xmm0
; X86-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

; lshr of <4 x i32> by a uniform constant: every target lowers to a single
; immediate psrld/vpsrld.
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrld $5, %xmm0
; X86-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

; lshr of <8 x i16> by a uniform constant: every target lowers to a single
; immediate psrlw/vpsrlw.
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v8i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

; lshr of <16 x i8> by a uniform constant: SSE/AVX/AVX512 use a word shift
; (psrlw) followed by a constant-pool mask (pand/vpand/vpandd) to clear the
; bits shifted in from the neighboring byte; XOP has a native byte shift
; (vpshlb) and needs no mask.
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v16i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}

; The shift amount is a select between two splatted values. Targets with a
; variable per-element shift (AVX2/XOP-AVX2/AVX512) blend or mask-select the
; splatted amounts first and then issue a single vpsrlvd; SSE2/X86-SSE instead
; perform both shifts and blend the results with pand/pandn/por; SSE4.1/AVX1
; shift twice and use blendvps/vblendvps; XOP-AVX1 negates the selected amount
; and uses vpshld.
define <4 x i32> @vector_variable_shift_right(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) nounwind {
; SSE2-LABEL: vector_variable_shift_right:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm4, %xmm4
; SSE2-NEXT:    xorps %xmm5, %xmm5
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3]
; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    psrld %xmm4, %xmm1
; SSE2-NEXT:    psrld %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pandn %xmm3, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vector_variable_shift_right:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psrld %xmm1, %xmm4
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vector_variable_shift_right:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vector_variable_shift_right:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: vector_variable_shift_right:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
; XOPAVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT:    vpshld %xmm0, %xmm3, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: vector_variable_shift_right:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; XOPAVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; XOPAVX2-NEXT:    vbroadcastss %xmm2, %xmm2
; XOPAVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; XOPAVX2-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: vector_variable_shift_right:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT:    vpbroadcastd %xmm1, %xmm0
; AVX512DQ-NEXT:    vpbroadcastd %xmm2, %xmm1
; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT:    vpsrlvd %xmm1, %xmm3, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vector_variable_shift_right:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm0
; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm1
; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vpsrlvd %xmm1, %xmm3, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: vector_variable_shift_right:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpmovd2m %xmm0, %k1
; AVX512DQVL-NEXT:    vpbroadcastd %xmm2, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; AVX512DQVL-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: vector_variable_shift_right:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512BWVL-NEXT:    vpbroadcastd %xmm2, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; AVX512BWVL-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: vector_variable_shift_right:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $16, %esp
; X86-SSE-NEXT:    xorps %xmm3, %xmm3
; X86-SSE-NEXT:    xorps %xmm4, %xmm4
; X86-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
; X86-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; X86-SSE-NEXT:    pslld $31, %xmm0
; X86-SSE-NEXT:    psrad $31, %xmm0
; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psrld %xmm3, %xmm2
; X86-SSE-NEXT:    psrld %xmm4, %xmm1
; X86-SSE-NEXT:    pand %xmm0, %xmm2
; X86-SSE-NEXT:    pandn %xmm1, %xmm0
; X86-SSE-NEXT:    por %xmm2, %xmm0
; X86-SSE-NEXT:    movl %ebp, %esp
; X86-SSE-NEXT:    popl %ebp
; X86-SSE-NEXT:    retl
  %splat1 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
  %splat2 = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> zeroinitializer
  %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2
  %sh = lshr <4 x i32> %z, %sel
  ret <4 x i32> %sh
}