1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL 12; 13; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
; (That i686 +sse2 run is checked under its own X86-SSE prefix; the x86_64 runs above each pair a shared prefix with a feature-specific one.)
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE 15 16; 17; Variable Shifts 18; 19 20define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 21; SSE2-LABEL: var_shift_v2i64: 22; SSE2: # %bb.0: 23; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 24; SSE2-NEXT: movdqa %xmm2, %xmm3 25; SSE2-NEXT: psrlq %xmm1, %xmm3 26; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] 27; SSE2-NEXT: psrlq %xmm4, %xmm2 28; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 29; SSE2-NEXT: movdqa %xmm0, %xmm3 30; SSE2-NEXT: psrlq %xmm1, %xmm3 31; SSE2-NEXT: psrlq %xmm4, %xmm0 32; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 33; SSE2-NEXT: xorpd %xmm2, %xmm0 34; SSE2-NEXT: psubq %xmm2, %xmm0 35; SSE2-NEXT: retq 36; 37; SSE41-LABEL: var_shift_v2i64: 38; SSE41: # %bb.0: 39; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 40; SSE41-NEXT: movdqa %xmm2, %xmm3 41; SSE41-NEXT: psrlq %xmm1, %xmm3 42; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] 43; SSE41-NEXT: psrlq %xmm4, %xmm2 44; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 45; SSE41-NEXT: movdqa %xmm0, %xmm3 46; SSE41-NEXT: psrlq %xmm1, %xmm3 47; SSE41-NEXT: psrlq %xmm4, %xmm0 48; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 49; SSE41-NEXT: pxor %xmm2, %xmm0 50; SSE41-NEXT: psubq %xmm2, %xmm0 51; SSE41-NEXT: retq 52; 53; AVX1-LABEL: var_shift_v2i64: 54; AVX1: # %bb.0: 55; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 56; AVX1-NEXT: # xmm2 = mem[0,0] 57; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 58; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] 59; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 60; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 61; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 62; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm0 63; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 64; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 65; 
AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 66; AVX1-NEXT: retq 67; 68; AVX2-LABEL: var_shift_v2i64: 69; AVX2: # %bb.0: 70; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 71; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 72; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 73; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 74; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 75; AVX2-NEXT: retq 76; 77; XOP-LABEL: var_shift_v2i64: 78; XOP: # %bb.0: 79; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 80; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 81; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 82; XOP-NEXT: retq 83; 84; AVX512-LABEL: var_shift_v2i64: 85; AVX512: # %bb.0: 86; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 87; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 88; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 89; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 90; AVX512-NEXT: vzeroupper 91; AVX512-NEXT: retq 92; 93; AVX512VL-LABEL: var_shift_v2i64: 94; AVX512VL: # %bb.0: 95; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 96; AVX512VL-NEXT: retq 97; 98; X86-SSE-LABEL: var_shift_v2i64: 99; X86-SSE: # %bb.0: 100; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 101; X86-SSE-NEXT: movdqa %xmm2, %xmm3 102; X86-SSE-NEXT: psrlq %xmm1, %xmm3 103; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] 104; X86-SSE-NEXT: psrlq %xmm4, %xmm2 105; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 106; X86-SSE-NEXT: movdqa %xmm0, %xmm3 107; X86-SSE-NEXT: psrlq %xmm1, %xmm3 108; X86-SSE-NEXT: psrlq %xmm4, %xmm0 109; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 110; X86-SSE-NEXT: xorpd %xmm2, %xmm0 111; X86-SSE-NEXT: psubq %xmm2, %xmm0 112; X86-SSE-NEXT: retl 113 %shift = ashr <2 x i64> %a, %b 114 ret <2 x i64> %shift 115} 116 117define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 118; SSE2-LABEL: var_shift_v4i32: 119; SSE2: # %bb.0: 120; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 121; SSE2-NEXT: movdqa %xmm0, %xmm3 122; SSE2-NEXT: 
psrad %xmm2, %xmm3 123; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 124; SSE2-NEXT: movdqa %xmm0, %xmm2 125; SSE2-NEXT: psrad %xmm4, %xmm2 126; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 127; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 128; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 129; SSE2-NEXT: movdqa %xmm0, %xmm4 130; SSE2-NEXT: psrad %xmm3, %xmm4 131; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 132; SSE2-NEXT: psrad %xmm1, %xmm0 133; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 134; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 135; SSE2-NEXT: movaps %xmm2, %xmm0 136; SSE2-NEXT: retq 137; 138; SSE41-LABEL: var_shift_v4i32: 139; SSE41: # %bb.0: 140; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 141; SSE41-NEXT: movdqa %xmm0, %xmm3 142; SSE41-NEXT: psrad %xmm2, %xmm3 143; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 144; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 145; SSE41-NEXT: movdqa %xmm0, %xmm5 146; SSE41-NEXT: psrad %xmm4, %xmm5 147; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 148; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 149; SSE41-NEXT: movdqa %xmm0, %xmm3 150; SSE41-NEXT: psrad %xmm1, %xmm3 151; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] 152; SSE41-NEXT: psrad %xmm1, %xmm0 153; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 154; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 155; SSE41-NEXT: retq 156; 157; AVX1-LABEL: var_shift_v4i32: 158; AVX1: # %bb.0: 159; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 160; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 161; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 162; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 163; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 164; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 165; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = 
xmm1[2],xmm3[2],xmm1[3],xmm3[3] 166; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 167; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 168; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 169; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 170; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 171; AVX1-NEXT: retq 172; 173; AVX2-LABEL: var_shift_v4i32: 174; AVX2: # %bb.0: 175; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 176; AVX2-NEXT: retq 177; 178; XOPAVX1-LABEL: var_shift_v4i32: 179; XOPAVX1: # %bb.0: 180; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 181; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 182; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 183; XOPAVX1-NEXT: retq 184; 185; XOPAVX2-LABEL: var_shift_v4i32: 186; XOPAVX2: # %bb.0: 187; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 188; XOPAVX2-NEXT: retq 189; 190; AVX512-LABEL: var_shift_v4i32: 191; AVX512: # %bb.0: 192; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 193; AVX512-NEXT: retq 194; 195; AVX512VL-LABEL: var_shift_v4i32: 196; AVX512VL: # %bb.0: 197; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 198; AVX512VL-NEXT: retq 199; 200; X86-SSE-LABEL: var_shift_v4i32: 201; X86-SSE: # %bb.0: 202; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 203; X86-SSE-NEXT: movdqa %xmm0, %xmm3 204; X86-SSE-NEXT: psrad %xmm2, %xmm3 205; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 206; X86-SSE-NEXT: movdqa %xmm0, %xmm2 207; X86-SSE-NEXT: psrad %xmm4, %xmm2 208; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 209; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 210; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 211; X86-SSE-NEXT: movdqa %xmm0, %xmm4 212; X86-SSE-NEXT: psrad %xmm3, %xmm4 213; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 214; X86-SSE-NEXT: psrad %xmm1, %xmm0 215; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 216; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 217; X86-SSE-NEXT: movaps %xmm2, %xmm0 218; 
X86-SSE-NEXT: retl 219 %shift = ashr <4 x i32> %a, %b 220 ret <4 x i32> %shift 221} 222 223define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 224; SSE2-LABEL: var_shift_v8i16: 225; SSE2: # %bb.0: 226; SSE2-NEXT: psllw $12, %xmm1 227; SSE2-NEXT: movdqa %xmm1, %xmm2 228; SSE2-NEXT: psraw $15, %xmm2 229; SSE2-NEXT: movdqa %xmm2, %xmm3 230; SSE2-NEXT: pandn %xmm0, %xmm3 231; SSE2-NEXT: psraw $8, %xmm0 232; SSE2-NEXT: pand %xmm2, %xmm0 233; SSE2-NEXT: por %xmm3, %xmm0 234; SSE2-NEXT: paddw %xmm1, %xmm1 235; SSE2-NEXT: movdqa %xmm1, %xmm2 236; SSE2-NEXT: psraw $15, %xmm2 237; SSE2-NEXT: movdqa %xmm2, %xmm3 238; SSE2-NEXT: pandn %xmm0, %xmm3 239; SSE2-NEXT: psraw $4, %xmm0 240; SSE2-NEXT: pand %xmm2, %xmm0 241; SSE2-NEXT: por %xmm3, %xmm0 242; SSE2-NEXT: paddw %xmm1, %xmm1 243; SSE2-NEXT: movdqa %xmm1, %xmm2 244; SSE2-NEXT: psraw $15, %xmm2 245; SSE2-NEXT: movdqa %xmm2, %xmm3 246; SSE2-NEXT: pandn %xmm0, %xmm3 247; SSE2-NEXT: psraw $2, %xmm0 248; SSE2-NEXT: pand %xmm2, %xmm0 249; SSE2-NEXT: por %xmm3, %xmm0 250; SSE2-NEXT: paddw %xmm1, %xmm1 251; SSE2-NEXT: psraw $15, %xmm1 252; SSE2-NEXT: movdqa %xmm1, %xmm2 253; SSE2-NEXT: pandn %xmm0, %xmm2 254; SSE2-NEXT: psraw $1, %xmm0 255; SSE2-NEXT: pand %xmm1, %xmm0 256; SSE2-NEXT: por %xmm2, %xmm0 257; SSE2-NEXT: retq 258; 259; SSE41-LABEL: var_shift_v8i16: 260; SSE41: # %bb.0: 261; SSE41-NEXT: movdqa %xmm0, %xmm2 262; SSE41-NEXT: movdqa %xmm1, %xmm0 263; SSE41-NEXT: psllw $12, %xmm0 264; SSE41-NEXT: psllw $4, %xmm1 265; SSE41-NEXT: por %xmm1, %xmm0 266; SSE41-NEXT: movdqa %xmm0, %xmm1 267; SSE41-NEXT: paddw %xmm0, %xmm1 268; SSE41-NEXT: movdqa %xmm2, %xmm3 269; SSE41-NEXT: psraw $8, %xmm3 270; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 271; SSE41-NEXT: movdqa %xmm2, %xmm3 272; SSE41-NEXT: psraw $4, %xmm3 273; SSE41-NEXT: movdqa %xmm1, %xmm0 274; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 275; SSE41-NEXT: movdqa %xmm2, %xmm3 276; SSE41-NEXT: psraw $2, %xmm3 277; SSE41-NEXT: paddw %xmm1, %xmm1 278; SSE41-NEXT: 
movdqa %xmm1, %xmm0 279; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 280; SSE41-NEXT: movdqa %xmm2, %xmm3 281; SSE41-NEXT: psraw $1, %xmm3 282; SSE41-NEXT: paddw %xmm1, %xmm1 283; SSE41-NEXT: movdqa %xmm1, %xmm0 284; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 285; SSE41-NEXT: movdqa %xmm2, %xmm0 286; SSE41-NEXT: retq 287; 288; AVX1-LABEL: var_shift_v8i16: 289; AVX1: # %bb.0: 290; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 291; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 292; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 293; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 294; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 295; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 296; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 297; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 298; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 299; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 300; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 301; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 302; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 303; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 304; AVX1-NEXT: retq 305; 306; AVX2-LABEL: var_shift_v8i16: 307; AVX2: # %bb.0: 308; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 309; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 310; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 311; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 312; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 313; AVX2-NEXT: vzeroupper 314; AVX2-NEXT: retq 315; 316; XOP-LABEL: var_shift_v8i16: 317; XOP: # %bb.0: 318; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 319; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 320; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 321; XOP-NEXT: retq 322; 323; AVX512DQ-LABEL: var_shift_v8i16: 324; AVX512DQ: # %bb.0: 325; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 326; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 327; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 328; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 
329; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 330; AVX512DQ-NEXT: vzeroupper 331; AVX512DQ-NEXT: retq 332; 333; AVX512BW-LABEL: var_shift_v8i16: 334; AVX512BW: # %bb.0: 335; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 336; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 337; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 338; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 339; AVX512BW-NEXT: vzeroupper 340; AVX512BW-NEXT: retq 341; 342; AVX512DQVL-LABEL: var_shift_v8i16: 343; AVX512DQVL: # %bb.0: 344; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 345; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 346; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 347; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 348; AVX512DQVL-NEXT: vzeroupper 349; AVX512DQVL-NEXT: retq 350; 351; AVX512BWVL-LABEL: var_shift_v8i16: 352; AVX512BWVL: # %bb.0: 353; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 354; AVX512BWVL-NEXT: retq 355; 356; X86-SSE-LABEL: var_shift_v8i16: 357; X86-SSE: # %bb.0: 358; X86-SSE-NEXT: psllw $12, %xmm1 359; X86-SSE-NEXT: movdqa %xmm1, %xmm2 360; X86-SSE-NEXT: psraw $15, %xmm2 361; X86-SSE-NEXT: movdqa %xmm2, %xmm3 362; X86-SSE-NEXT: pandn %xmm0, %xmm3 363; X86-SSE-NEXT: psraw $8, %xmm0 364; X86-SSE-NEXT: pand %xmm2, %xmm0 365; X86-SSE-NEXT: por %xmm3, %xmm0 366; X86-SSE-NEXT: paddw %xmm1, %xmm1 367; X86-SSE-NEXT: movdqa %xmm1, %xmm2 368; X86-SSE-NEXT: psraw $15, %xmm2 369; X86-SSE-NEXT: movdqa %xmm2, %xmm3 370; X86-SSE-NEXT: pandn %xmm0, %xmm3 371; X86-SSE-NEXT: psraw $4, %xmm0 372; X86-SSE-NEXT: pand %xmm2, %xmm0 373; X86-SSE-NEXT: por %xmm3, %xmm0 374; X86-SSE-NEXT: paddw %xmm1, %xmm1 375; X86-SSE-NEXT: movdqa %xmm1, %xmm2 376; X86-SSE-NEXT: psraw $15, %xmm2 377; X86-SSE-NEXT: movdqa %xmm2, %xmm3 378; X86-SSE-NEXT: pandn %xmm0, %xmm3 379; X86-SSE-NEXT: psraw $2, %xmm0 380; X86-SSE-NEXT: pand %xmm2, %xmm0 381; X86-SSE-NEXT: por %xmm3, 
%xmm0 382; X86-SSE-NEXT: paddw %xmm1, %xmm1 383; X86-SSE-NEXT: psraw $15, %xmm1 384; X86-SSE-NEXT: movdqa %xmm1, %xmm2 385; X86-SSE-NEXT: pandn %xmm0, %xmm2 386; X86-SSE-NEXT: psraw $1, %xmm0 387; X86-SSE-NEXT: pand %xmm1, %xmm0 388; X86-SSE-NEXT: por %xmm2, %xmm0 389; X86-SSE-NEXT: retl 390 %shift = ashr <8 x i16> %a, %b 391 ret <8 x i16> %shift 392} 393 394define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 395; SSE2-LABEL: var_shift_v16i8: 396; SSE2: # %bb.0: 397; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 398; SSE2-NEXT: psllw $5, %xmm1 399; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 400; SSE2-NEXT: pxor %xmm3, %xmm3 401; SSE2-NEXT: pxor %xmm5, %xmm5 402; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 403; SSE2-NEXT: movdqa %xmm5, %xmm6 404; SSE2-NEXT: pandn %xmm2, %xmm6 405; SSE2-NEXT: psraw $4, %xmm2 406; SSE2-NEXT: pand %xmm5, %xmm2 407; SSE2-NEXT: por %xmm6, %xmm2 408; SSE2-NEXT: paddw %xmm4, %xmm4 409; SSE2-NEXT: pxor %xmm5, %xmm5 410; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 411; SSE2-NEXT: movdqa %xmm5, %xmm6 412; SSE2-NEXT: pandn %xmm2, %xmm6 413; SSE2-NEXT: psraw $2, %xmm2 414; SSE2-NEXT: pand %xmm5, %xmm2 415; SSE2-NEXT: por %xmm6, %xmm2 416; SSE2-NEXT: paddw %xmm4, %xmm4 417; SSE2-NEXT: pxor %xmm5, %xmm5 418; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 419; SSE2-NEXT: movdqa %xmm5, %xmm4 420; SSE2-NEXT: pandn %xmm2, %xmm4 421; SSE2-NEXT: psraw $1, %xmm2 422; SSE2-NEXT: pand %xmm5, %xmm2 423; SSE2-NEXT: por %xmm4, %xmm2 424; SSE2-NEXT: psrlw $8, %xmm2 425; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 426; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 427; SSE2-NEXT: pxor %xmm4, %xmm4 428; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 429; SSE2-NEXT: movdqa 
%xmm4, %xmm5 430; SSE2-NEXT: pandn %xmm0, %xmm5 431; SSE2-NEXT: psraw $4, %xmm0 432; SSE2-NEXT: pand %xmm4, %xmm0 433; SSE2-NEXT: por %xmm5, %xmm0 434; SSE2-NEXT: paddw %xmm1, %xmm1 435; SSE2-NEXT: pxor %xmm4, %xmm4 436; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 437; SSE2-NEXT: movdqa %xmm4, %xmm5 438; SSE2-NEXT: pandn %xmm0, %xmm5 439; SSE2-NEXT: psraw $2, %xmm0 440; SSE2-NEXT: pand %xmm4, %xmm0 441; SSE2-NEXT: por %xmm5, %xmm0 442; SSE2-NEXT: paddw %xmm1, %xmm1 443; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 444; SSE2-NEXT: movdqa %xmm3, %xmm1 445; SSE2-NEXT: pandn %xmm0, %xmm1 446; SSE2-NEXT: psraw $1, %xmm0 447; SSE2-NEXT: pand %xmm3, %xmm0 448; SSE2-NEXT: por %xmm1, %xmm0 449; SSE2-NEXT: psrlw $8, %xmm0 450; SSE2-NEXT: packuswb %xmm2, %xmm0 451; SSE2-NEXT: retq 452; 453; SSE41-LABEL: var_shift_v16i8: 454; SSE41: # %bb.0: 455; SSE41-NEXT: movdqa %xmm0, %xmm2 456; SSE41-NEXT: psllw $5, %xmm1 457; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 458; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 459; SSE41-NEXT: movdqa %xmm3, %xmm4 460; SSE41-NEXT: psraw $4, %xmm4 461; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 462; SSE41-NEXT: movdqa %xmm3, %xmm4 463; SSE41-NEXT: psraw $2, %xmm4 464; SSE41-NEXT: paddw %xmm0, %xmm0 465; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 466; SSE41-NEXT: movdqa %xmm3, %xmm4 467; SSE41-NEXT: psraw $1, %xmm4 468; SSE41-NEXT: paddw %xmm0, %xmm0 469; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 470; SSE41-NEXT: psrlw $8, %xmm3 471; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 472; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 473; SSE41-NEXT: movdqa %xmm1, %xmm2 474; SSE41-NEXT: psraw $4, %xmm2 475; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 476; SSE41-NEXT: movdqa %xmm1, %xmm2 477; SSE41-NEXT: psraw $2, %xmm2 478; SSE41-NEXT: paddw %xmm0, %xmm0 479; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 480; SSE41-NEXT: movdqa %xmm1, %xmm2 481; SSE41-NEXT: psraw $1, %xmm2 482; SSE41-NEXT: paddw %xmm0, %xmm0 483; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 484; SSE41-NEXT: psrlw $8, %xmm1 485; SSE41-NEXT: packuswb %xmm3, %xmm1 486; SSE41-NEXT: movdqa %xmm1, %xmm0 487; SSE41-NEXT: retq 488; 489; AVX-LABEL: var_shift_v16i8: 490; AVX: # %bb.0: 491; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 492; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 493; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 494; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 495; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 496; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 497; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 498; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 499; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 500; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 501; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 502; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 503; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 504; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 505; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 506; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 507; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 508; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 509; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 510; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 511; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 512; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 513; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 514; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 515; AVX-NEXT: retq 516; 517; XOP-LABEL: 
var_shift_v16i8: 518; XOP: # %bb.0: 519; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 520; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 521; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 522; XOP-NEXT: retq 523; 524; AVX512DQ-LABEL: var_shift_v16i8: 525; AVX512DQ: # %bb.0: 526; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 527; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 528; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 529; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 530; AVX512DQ-NEXT: vzeroupper 531; AVX512DQ-NEXT: retq 532; 533; AVX512BW-LABEL: var_shift_v16i8: 534; AVX512BW: # %bb.0: 535; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 536; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 537; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 538; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 539; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 540; AVX512BW-NEXT: vzeroupper 541; AVX512BW-NEXT: retq 542; 543; AVX512DQVL-LABEL: var_shift_v16i8: 544; AVX512DQVL: # %bb.0: 545; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 546; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 547; AVX512DQVL-NEXT: 
vpsravd %zmm1, %zmm0, %zmm0 548; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 549; AVX512DQVL-NEXT: vzeroupper 550; AVX512DQVL-NEXT: retq 551; 552; AVX512BWVL-LABEL: var_shift_v16i8: 553; AVX512BWVL: # %bb.0: 554; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 555; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 556; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 557; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 558; AVX512BWVL-NEXT: vzeroupper 559; AVX512BWVL-NEXT: retq 560; 561; X86-SSE-LABEL: var_shift_v16i8: 562; X86-SSE: # %bb.0: 563; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 564; X86-SSE-NEXT: psllw $5, %xmm1 565; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 566; X86-SSE-NEXT: pxor %xmm3, %xmm3 567; X86-SSE-NEXT: pxor %xmm5, %xmm5 568; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 569; X86-SSE-NEXT: movdqa %xmm5, %xmm6 570; X86-SSE-NEXT: pandn %xmm2, %xmm6 571; X86-SSE-NEXT: psraw $4, %xmm2 572; X86-SSE-NEXT: pand %xmm5, %xmm2 573; X86-SSE-NEXT: por %xmm6, %xmm2 574; X86-SSE-NEXT: paddw %xmm4, %xmm4 575; X86-SSE-NEXT: pxor %xmm5, %xmm5 576; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 577; X86-SSE-NEXT: movdqa %xmm5, %xmm6 578; X86-SSE-NEXT: pandn %xmm2, %xmm6 579; X86-SSE-NEXT: psraw $2, %xmm2 580; X86-SSE-NEXT: pand %xmm5, %xmm2 581; X86-SSE-NEXT: por %xmm6, %xmm2 582; X86-SSE-NEXT: paddw %xmm4, %xmm4 583; X86-SSE-NEXT: pxor %xmm5, %xmm5 584; X86-SSE-NEXT: pcmpgtw %xmm4, %xmm5 585; X86-SSE-NEXT: movdqa %xmm5, %xmm4 586; X86-SSE-NEXT: pandn %xmm2, %xmm4 587; X86-SSE-NEXT: psraw $1, %xmm2 588; X86-SSE-NEXT: pand %xmm5, %xmm2 589; 
X86-SSE-NEXT: por %xmm4, %xmm2 590; X86-SSE-NEXT: psrlw $8, %xmm2 591; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 592; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 593; X86-SSE-NEXT: pxor %xmm4, %xmm4 594; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 595; X86-SSE-NEXT: movdqa %xmm4, %xmm5 596; X86-SSE-NEXT: pandn %xmm0, %xmm5 597; X86-SSE-NEXT: psraw $4, %xmm0 598; X86-SSE-NEXT: pand %xmm4, %xmm0 599; X86-SSE-NEXT: por %xmm5, %xmm0 600; X86-SSE-NEXT: paddw %xmm1, %xmm1 601; X86-SSE-NEXT: pxor %xmm4, %xmm4 602; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm4 603; X86-SSE-NEXT: movdqa %xmm4, %xmm5 604; X86-SSE-NEXT: pandn %xmm0, %xmm5 605; X86-SSE-NEXT: psraw $2, %xmm0 606; X86-SSE-NEXT: pand %xmm4, %xmm0 607; X86-SSE-NEXT: por %xmm5, %xmm0 608; X86-SSE-NEXT: paddw %xmm1, %xmm1 609; X86-SSE-NEXT: pcmpgtw %xmm1, %xmm3 610; X86-SSE-NEXT: movdqa %xmm3, %xmm1 611; X86-SSE-NEXT: pandn %xmm0, %xmm1 612; X86-SSE-NEXT: psraw $1, %xmm0 613; X86-SSE-NEXT: pand %xmm3, %xmm0 614; X86-SSE-NEXT: por %xmm1, %xmm0 615; X86-SSE-NEXT: psrlw $8, %xmm0 616; X86-SSE-NEXT: packuswb %xmm2, %xmm0 617; X86-SSE-NEXT: retl 618 %shift = ashr <16 x i8> %a, %b 619 ret <16 x i8> %shift 620} 621 622; 623; Uniform Variable Shifts 624; 625 626define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 627; SSE-LABEL: splatvar_shift_v2i64: 628; SSE: # %bb.0: 629; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 630; SSE-NEXT: psrlq %xmm1, %xmm2 631; SSE-NEXT: psrlq %xmm1, %xmm0 632; SSE-NEXT: pxor %xmm2, %xmm0 633; SSE-NEXT: psubq %xmm2, %xmm0 634; SSE-NEXT: retq 635; 636; AVX1-LABEL: splatvar_shift_v2i64: 637; AVX1: # %bb.0: 638; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 639; AVX1-NEXT: # xmm2 = mem[0,0] 640; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 641; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 642; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 643; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 644; 
AVX1-NEXT: retq 645; 646; AVX2-LABEL: splatvar_shift_v2i64: 647; AVX2: # %bb.0: 648; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 649; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 650; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 651; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 652; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 653; AVX2-NEXT: retq 654; 655; XOPAVX1-LABEL: splatvar_shift_v2i64: 656; XOPAVX1: # %bb.0: 657; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 658; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 659; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 660; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 661; XOPAVX1-NEXT: retq 662; 663; XOPAVX2-LABEL: splatvar_shift_v2i64: 664; XOPAVX2: # %bb.0: 665; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 666; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 667; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 668; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 669; XOPAVX2-NEXT: retq 670; 671; AVX512-LABEL: splatvar_shift_v2i64: 672; AVX512: # %bb.0: 673; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 674; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 675; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 676; AVX512-NEXT: vzeroupper 677; AVX512-NEXT: retq 678; 679; AVX512VL-LABEL: splatvar_shift_v2i64: 680; AVX512VL: # %bb.0: 681; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 682; AVX512VL-NEXT: retq 683; 684; X86-SSE-LABEL: splatvar_shift_v2i64: 685; X86-SSE: # %bb.0: 686; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 687; X86-SSE-NEXT: psrlq %xmm1, %xmm2 688; X86-SSE-NEXT: psrlq %xmm1, %xmm0 689; X86-SSE-NEXT: pxor %xmm2, %xmm0 690; X86-SSE-NEXT: psubq %xmm2, %xmm0 691; X86-SSE-NEXT: retl 692 %splat = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer 693 %shift = ashr <2 x i64> %a, %splat 694 ret <2 x i64> %shift 695} 696 697define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 698; SSE2-LABEL: splatvar_shift_v4i32: 699; SSE2: # %bb.0: 700; SSE2-NEXT: xorps %xmm2, 
%xmm2 701; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 702; SSE2-NEXT: psrad %xmm2, %xmm0 703; SSE2-NEXT: retq 704; 705; SSE41-LABEL: splatvar_shift_v4i32: 706; SSE41: # %bb.0: 707; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 708; SSE41-NEXT: psrad %xmm1, %xmm0 709; SSE41-NEXT: retq 710; 711; AVX-LABEL: splatvar_shift_v4i32: 712; AVX: # %bb.0: 713; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 714; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 715; AVX-NEXT: retq 716; 717; XOP-LABEL: splatvar_shift_v4i32: 718; XOP: # %bb.0: 719; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 720; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 721; XOP-NEXT: retq 722; 723; AVX512-LABEL: splatvar_shift_v4i32: 724; AVX512: # %bb.0: 725; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 726; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 727; AVX512-NEXT: retq 728; 729; AVX512VL-LABEL: splatvar_shift_v4i32: 730; AVX512VL: # %bb.0: 731; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 732; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 733; AVX512VL-NEXT: retq 734; 735; X86-SSE-LABEL: splatvar_shift_v4i32: 736; X86-SSE: # %bb.0: 737; X86-SSE-NEXT: xorps %xmm2, %xmm2 738; X86-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 739; X86-SSE-NEXT: psrad %xmm2, %xmm0 740; X86-SSE-NEXT: retl 741 %splat = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer 742 %shift = ashr <4 x i32> %a, %splat 743 ret <4 x i32> %shift 744} 745 746define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 747; SSE2-LABEL: splatvar_shift_v8i16: 748; SSE2: # %bb.0: 749; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 750; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 751; SSE2-NEXT: psraw %xmm1, %xmm0 752; SSE2-NEXT: retq 753; 754; SSE41-LABEL: splatvar_shift_v8i16: 755; SSE41: # 
%bb.0: 756; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 757; SSE41-NEXT: psraw %xmm1, %xmm0 758; SSE41-NEXT: retq 759; 760; AVX-LABEL: splatvar_shift_v8i16: 761; AVX: # %bb.0: 762; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 763; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 764; AVX-NEXT: retq 765; 766; XOP-LABEL: splatvar_shift_v8i16: 767; XOP: # %bb.0: 768; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 769; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 770; XOP-NEXT: retq 771; 772; AVX512-LABEL: splatvar_shift_v8i16: 773; AVX512: # %bb.0: 774; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 775; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 776; AVX512-NEXT: retq 777; 778; AVX512VL-LABEL: splatvar_shift_v8i16: 779; AVX512VL: # %bb.0: 780; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 781; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 782; AVX512VL-NEXT: retq 783; 784; X86-SSE-LABEL: splatvar_shift_v8i16: 785; X86-SSE: # %bb.0: 786; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 787; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 788; X86-SSE-NEXT: psraw %xmm1, %xmm0 789; X86-SSE-NEXT: retl 790 %splat = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer 791 %shift = ashr <8 x i16> %a, %splat 792 ret <8 x i16> %shift 793} 794 795define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 796; SSE2-LABEL: splatvar_shift_v16i8: 797; SSE2: # %bb.0: 798; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 799; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 800; SSE2-NEXT: psrlw %xmm1, %xmm0 801; 
SSE2-NEXT: pcmpeqd %xmm2, %xmm2 802; SSE2-NEXT: psrlw %xmm1, %xmm2 803; SSE2-NEXT: psrlw $8, %xmm2 804; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 805; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 806; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 807; SSE2-NEXT: pand %xmm2, %xmm0 808; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 809; SSE2-NEXT: psrlw %xmm1, %xmm2 810; SSE2-NEXT: pxor %xmm2, %xmm0 811; SSE2-NEXT: psubb %xmm2, %xmm0 812; SSE2-NEXT: retq 813; 814; SSE41-LABEL: splatvar_shift_v16i8: 815; SSE41: # %bb.0: 816; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 817; SSE41-NEXT: psrlw %xmm1, %xmm0 818; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 819; SSE41-NEXT: psrlw %xmm1, %xmm2 820; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 821; SSE41-NEXT: pand %xmm2, %xmm0 822; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 823; SSE41-NEXT: psrlw %xmm1, %xmm2 824; SSE41-NEXT: pxor %xmm2, %xmm0 825; SSE41-NEXT: psubb %xmm2, %xmm0 826; SSE41-NEXT: retq 827; 828; AVX1-LABEL: splatvar_shift_v16i8: 829; AVX1: # %bb.0: 830; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 831; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 832; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 833; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 834; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 835; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 836; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 837; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 838; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 839; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 840; AVX1-NEXT: retq 841; 842; AVX2-LABEL: splatvar_shift_v16i8: 843; AVX2: # %bb.0: 844; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 845; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 846; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 847; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 848; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 849; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 850; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 851; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 852; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 853; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 854; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 855; AVX2-NEXT: retq 856; 857; XOPAVX1-LABEL: splatvar_shift_v16i8: 858; XOPAVX1: # %bb.0: 859; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 860; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 861; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 862; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 863; XOPAVX1-NEXT: retq 864; 865; XOPAVX2-LABEL: splatvar_shift_v16i8: 866; XOPAVX2: # %bb.0: 867; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 868; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 869; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 870; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 871; XOPAVX2-NEXT: retq 872; 873; AVX512DQ-LABEL: splatvar_shift_v16i8: 874; AVX512DQ: # %bb.0: 875; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 876; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 877; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 878; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 879; AVX512DQ-NEXT: vzeroupper 880; AVX512DQ-NEXT: retq 881; 882; AVX512BW-LABEL: splatvar_shift_v16i8: 883; AVX512BW: # %bb.0: 884; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 885; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 886; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 887; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 888; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 889; AVX512BW-NEXT: vzeroupper 890; AVX512BW-NEXT: 
retq 891; 892; AVX512DQVL-LABEL: splatvar_shift_v16i8: 893; AVX512DQVL: # %bb.0: 894; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 895; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 896; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 897; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 898; AVX512DQVL-NEXT: vzeroupper 899; AVX512DQVL-NEXT: retq 900; 901; AVX512BWVL-LABEL: splatvar_shift_v16i8: 902; AVX512BWVL: # %bb.0: 903; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 904; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 905; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 906; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 907; AVX512BWVL-NEXT: vzeroupper 908; AVX512BWVL-NEXT: retq 909; 910; X86-SSE-LABEL: splatvar_shift_v16i8: 911; X86-SSE: # %bb.0: 912; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 913; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 914; X86-SSE-NEXT: psrlw %xmm1, %xmm0 915; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 916; X86-SSE-NEXT: psrlw %xmm1, %xmm2 917; X86-SSE-NEXT: psrlw $8, %xmm2 918; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 919; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 920; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 921; X86-SSE-NEXT: pand %xmm2, %xmm0 922; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 923; X86-SSE-NEXT: psrlw %xmm1, %xmm2 924; X86-SSE-NEXT: pxor %xmm2, %xmm0 925; X86-SSE-NEXT: psubb %xmm2, %xmm0 926; X86-SSE-NEXT: retl 927 %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer 928 %shift = ashr <16 x i8> %a, %splat 929 ret <16 x i8> %shift 930} 931 932; 933; Uniform Variable Modulo Shifts 934; 935 936define <2 x i64> 
@splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 937; SSE-LABEL: splatvar_modulo_shift_v2i64: 938; SSE: # %bb.0: 939; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 940; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 941; SSE-NEXT: psrlq %xmm1, %xmm2 942; SSE-NEXT: psrlq %xmm1, %xmm0 943; SSE-NEXT: pxor %xmm2, %xmm0 944; SSE-NEXT: psubq %xmm2, %xmm0 945; SSE-NEXT: retq 946; 947; AVX1-LABEL: splatvar_modulo_shift_v2i64: 948; AVX1: # %bb.0: 949; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 950; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 951; AVX1-NEXT: # xmm2 = mem[0,0] 952; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 953; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 954; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 955; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 956; AVX1-NEXT: retq 957; 958; AVX2-LABEL: splatvar_modulo_shift_v2i64: 959; AVX2: # %bb.0: 960; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 961; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 962; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 963; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 964; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 965; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 966; AVX2-NEXT: retq 967; 968; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64: 969; XOPAVX1: # %bb.0: 970; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 971; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 972; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 973; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 974; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 975; XOPAVX1-NEXT: retq 976; 977; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64: 978; XOPAVX2: # %bb.0: 979; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 980; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 981; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 982; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 983; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 984; 
XOPAVX2-NEXT: retq 985; 986; AVX512-LABEL: splatvar_modulo_shift_v2i64: 987; AVX512: # %bb.0: 988; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 989; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 990; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 991; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 992; AVX512-NEXT: vzeroupper 993; AVX512-NEXT: retq 994; 995; AVX512VL-LABEL: splatvar_modulo_shift_v2i64: 996; AVX512VL: # %bb.0: 997; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 998; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 999; AVX512VL-NEXT: retq 1000; 1001; X86-SSE-LABEL: splatvar_modulo_shift_v2i64: 1002; X86-SSE: # %bb.0: 1003; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1004; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 1005; X86-SSE-NEXT: psrlq %xmm1, %xmm2 1006; X86-SSE-NEXT: psrlq %xmm1, %xmm0 1007; X86-SSE-NEXT: pxor %xmm2, %xmm0 1008; X86-SSE-NEXT: psubq %xmm2, %xmm0 1009; X86-SSE-NEXT: retl 1010 %mod = and <2 x i64> %b, <i64 63, i64 63> 1011 %splat = shufflevector <2 x i64> %mod, <2 x i64> poison, <2 x i32> zeroinitializer 1012 %shift = ashr <2 x i64> %a, %splat 1013 ret <2 x i64> %shift 1014} 1015 1016define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 1017; SSE-LABEL: splatvar_modulo_shift_v4i32: 1018; SSE: # %bb.0: 1019; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1020; SSE-NEXT: psrad %xmm1, %xmm0 1021; SSE-NEXT: retq 1022; 1023; AVX-LABEL: splatvar_modulo_shift_v4i32: 1024; AVX: # %bb.0: 1025; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1026; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1027; AVX-NEXT: retq 1028; 1029; XOP-LABEL: splatvar_modulo_shift_v4i32: 1030; XOP: # %bb.0: 1031; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1032; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1033; XOP-NEXT: retq 1034; 1035; AVX512-LABEL: splatvar_modulo_shift_v4i32: 1036; AVX512: # %bb.0: 1037; AVX512-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1038; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1039; AVX512-NEXT: retq 1040; 1041; AVX512VL-LABEL: splatvar_modulo_shift_v4i32: 1042; AVX512VL: # %bb.0: 1043; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1044; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 1045; AVX512VL-NEXT: retq 1046; 1047; X86-SSE-LABEL: splatvar_modulo_shift_v4i32: 1048; X86-SSE: # %bb.0: 1049; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1050; X86-SSE-NEXT: psrad %xmm1, %xmm0 1051; X86-SSE-NEXT: retl 1052 %mod = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31> 1053 %splat = shufflevector <4 x i32> %mod, <4 x i32> poison, <4 x i32> zeroinitializer 1054 %shift = ashr <4 x i32> %a, %splat 1055 ret <4 x i32> %shift 1056} 1057 1058define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 1059; SSE-LABEL: splatvar_modulo_shift_v8i16: 1060; SSE: # %bb.0: 1061; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1062; SSE-NEXT: psraw %xmm1, %xmm0 1063; SSE-NEXT: retq 1064; 1065; AVX-LABEL: splatvar_modulo_shift_v8i16: 1066; AVX: # %bb.0: 1067; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1068; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1069; AVX-NEXT: retq 1070; 1071; XOP-LABEL: splatvar_modulo_shift_v8i16: 1072; XOP: # %bb.0: 1073; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1074; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1075; XOP-NEXT: retq 1076; 1077; AVX512-LABEL: splatvar_modulo_shift_v8i16: 1078; AVX512: # %bb.0: 1079; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1080; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1081; AVX512-NEXT: retq 1082; 1083; AVX512VL-LABEL: splatvar_modulo_shift_v8i16: 1084; AVX512VL: # %bb.0: 1085; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1086; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 1087; AVX512VL-NEXT: retq 1088; 1089; X86-SSE-LABEL: splatvar_modulo_shift_v8i16: 1090; X86-SSE: # %bb.0: 1091; X86-SSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1092; X86-SSE-NEXT: psraw %xmm1, %xmm0 1093; X86-SSE-NEXT: retl 1094 %mod = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 1095 %splat = shufflevector <8 x i16> %mod, <8 x i16> poison, <8 x i32> zeroinitializer 1096 %shift = ashr <8 x i16> %a, %splat 1097 ret <8 x i16> %shift 1098} 1099 1100define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 1101; SSE2-LABEL: splatvar_modulo_shift_v16i8: 1102; SSE2: # %bb.0: 1103; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1104; SSE2-NEXT: psrlw %xmm1, %xmm0 1105; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1106; SSE2-NEXT: psrlw %xmm1, %xmm2 1107; SSE2-NEXT: psrlw $8, %xmm2 1108; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1109; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1110; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1111; SSE2-NEXT: pand %xmm2, %xmm0 1112; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1113; SSE2-NEXT: psrlw %xmm1, %xmm2 1114; SSE2-NEXT: pxor %xmm2, %xmm0 1115; SSE2-NEXT: psubb %xmm2, %xmm0 1116; SSE2-NEXT: retq 1117; 1118; SSE41-LABEL: splatvar_modulo_shift_v16i8: 1119; SSE41: # %bb.0: 1120; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1121; SSE41-NEXT: psrlw %xmm1, %xmm0 1122; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 1123; SSE41-NEXT: psrlw %xmm1, %xmm2 1124; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1125; SSE41-NEXT: pand %xmm2, %xmm0 1126; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1127; SSE41-NEXT: psrlw %xmm1, %xmm2 1128; SSE41-NEXT: pxor %xmm2, %xmm0 1129; SSE41-NEXT: psubb %xmm2, %xmm0 1130; SSE41-NEXT: retq 1131; 1132; AVX1-LABEL: splatvar_modulo_shift_v16i8: 1133; AVX1: # %bb.0: 1134; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1135; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1136; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1137; 
AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1138; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1139; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1140; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1141; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1142; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1143; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1144; AVX1-NEXT: retq 1145; 1146; AVX2-LABEL: splatvar_modulo_shift_v16i8: 1147; AVX2: # %bb.0: 1148; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1149; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1150; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1151; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1152; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1153; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1154; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 1155; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1156; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1157; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1158; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1159; AVX2-NEXT: retq 1160; 1161; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: 1162; XOPAVX1: # %bb.0: 1163; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1164; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1165; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1166; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1167; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 1168; XOPAVX1-NEXT: retq 1169; 1170; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8: 1171; XOPAVX2: # %bb.0: 1172; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1173; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1174; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1175; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1176; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 1177; XOPAVX2-NEXT: retq 1178; 1179; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8: 1180; AVX512DQ: # %bb.0: 1181; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1182; AVX512DQ-NEXT: vpmovsxbd %xmm0, 
%zmm0 1183; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1184; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1185; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1186; AVX512DQ-NEXT: vzeroupper 1187; AVX512DQ-NEXT: retq 1188; 1189; AVX512BW-LABEL: splatvar_modulo_shift_v16i8: 1190; AVX512BW: # %bb.0: 1191; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1192; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1193; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1194; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1195; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1196; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1197; AVX512BW-NEXT: vzeroupper 1198; AVX512BW-NEXT: retq 1199; 1200; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: 1201; AVX512DQVL: # %bb.0: 1202; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 1203; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1204; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1205; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 1206; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1207; AVX512DQVL-NEXT: vzeroupper 1208; AVX512DQVL-NEXT: retq 1209; 1210; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: 1211; AVX512BWVL: # %bb.0: 1212; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 1213; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1214; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1215; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 1216; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1217; AVX512BWVL-NEXT: vzeroupper 1218; AVX512BWVL-NEXT: retq 1219; 1220; X86-SSE-LABEL: splatvar_modulo_shift_v16i8: 1221; X86-SSE: # %bb.0: 1222; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1223; X86-SSE-NEXT: 
psrlw %xmm1, %xmm0 1224; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 1225; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1226; X86-SSE-NEXT: psrlw $8, %xmm2 1227; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1228; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 1229; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1230; X86-SSE-NEXT: pand %xmm2, %xmm0 1231; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] 1232; X86-SSE-NEXT: psrlw %xmm1, %xmm2 1233; X86-SSE-NEXT: pxor %xmm2, %xmm0 1234; X86-SSE-NEXT: psubb %xmm2, %xmm0 1235; X86-SSE-NEXT: retl 1236 %mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> 1237 %splat = shufflevector <16 x i8> %mod, <16 x i8> poison, <16 x i32> zeroinitializer 1238 %shift = ashr <16 x i8> %a, %splat 1239 ret <16 x i8> %shift 1240} 1241 1242; 1243; Constant Shifts 1244; 1245 1246define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { 1247; SSE2-LABEL: constant_shift_v2i64: 1248; SSE2: # %bb.0: 1249; SSE2-NEXT: movdqa %xmm0, %xmm1 1250; SSE2-NEXT: psrlq $1, %xmm1 1251; SSE2-NEXT: psrlq $7, %xmm0 1252; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1253; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1254; SSE2-NEXT: xorpd %xmm1, %xmm0 1255; SSE2-NEXT: psubq %xmm1, %xmm0 1256; SSE2-NEXT: retq 1257; 1258; SSE41-LABEL: constant_shift_v2i64: 1259; SSE41: # %bb.0: 1260; SSE41-NEXT: movdqa %xmm0, %xmm1 1261; SSE41-NEXT: psrlq $7, %xmm1 1262; SSE41-NEXT: psrlq $1, %xmm0 1263; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1264; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1265; SSE41-NEXT: pxor %xmm1, %xmm0 1266; SSE41-NEXT: psubq %xmm1, %xmm0 1267; SSE41-NEXT: retq 1268; 1269; AVX1-LABEL: constant_shift_v2i64: 1270; AVX1: # %bb.0: 1271; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1272; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1273; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1274; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1275; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1276; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1277; AVX1-NEXT: retq 1278; 1279; AVX2-LABEL: constant_shift_v2i64: 1280; AVX2: # %bb.0: 1281; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1282; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1283; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1284; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1285; AVX2-NEXT: retq 1286; 1287; XOP-LABEL: constant_shift_v2i64: 1288; XOP: # %bb.0: 1289; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1290; XOP-NEXT: retq 1291; 1292; AVX512-LABEL: constant_shift_v2i64: 1293; AVX512: # %bb.0: 1294; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1295; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,7] 1296; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 1297; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1298; AVX512-NEXT: vzeroupper 1299; AVX512-NEXT: retq 1300; 1301; AVX512VL-LABEL: constant_shift_v2i64: 1302; AVX512VL: # %bb.0: 1303; AVX512VL-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1304; AVX512VL-NEXT: retq 1305; 1306; X86-SSE-LABEL: constant_shift_v2i64: 1307; X86-SSE: # %bb.0: 1308; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1309; X86-SSE-NEXT: psrlq $1, %xmm1 1310; X86-SSE-NEXT: psrlq $7, %xmm0 1311; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1312; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1073741824,0,16777216] 1313; X86-SSE-NEXT: xorpd %xmm1, %xmm0 1314; X86-SSE-NEXT: psubq %xmm1, %xmm0 1315; X86-SSE-NEXT: retl 1316 %shift = ashr <2 x i64> %a, <i64 1, i64 7> 1317 ret <2 x i64> %shift 1318} 1319 1320define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { 1321; SSE2-LABEL: constant_shift_v4i32: 1322; SSE2: # %bb.0: 1323; SSE2-NEXT: movdqa %xmm0, %xmm1 1324; SSE2-NEXT: psrad $7, %xmm1 1325; SSE2-NEXT: movdqa %xmm0, %xmm2 1326; SSE2-NEXT: 
psrad $6, %xmm2 1327; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1328; SSE2-NEXT: movdqa %xmm0, %xmm1 1329; SSE2-NEXT: psrad $5, %xmm1 1330; SSE2-NEXT: psrad $4, %xmm0 1331; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1332; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 1333; SSE2-NEXT: retq 1334; 1335; SSE41-LABEL: constant_shift_v4i32: 1336; SSE41: # %bb.0: 1337; SSE41-NEXT: movdqa %xmm0, %xmm1 1338; SSE41-NEXT: psrad $7, %xmm1 1339; SSE41-NEXT: movdqa %xmm0, %xmm2 1340; SSE41-NEXT: psrad $5, %xmm2 1341; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1342; SSE41-NEXT: movdqa %xmm0, %xmm1 1343; SSE41-NEXT: psrad $6, %xmm1 1344; SSE41-NEXT: psrad $4, %xmm0 1345; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1346; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1347; SSE41-NEXT: retq 1348; 1349; AVX1-LABEL: constant_shift_v4i32: 1350; AVX1: # %bb.0: 1351; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1352; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 1353; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1354; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 1355; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 1356; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1357; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1358; AVX1-NEXT: retq 1359; 1360; AVX2-LABEL: constant_shift_v4i32: 1361; AVX2: # %bb.0: 1362; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1363; AVX2-NEXT: retq 1364; 1365; XOPAVX1-LABEL: constant_shift_v4i32: 1366; XOPAVX1: # %bb.0: 1367; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1368; XOPAVX1-NEXT: retq 1369; 1370; XOPAVX2-LABEL: constant_shift_v4i32: 1371; XOPAVX2: # %bb.0: 1372; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1373; XOPAVX2-NEXT: retq 1374; 1375; AVX512-LABEL: constant_shift_v4i32: 1376; AVX512: # %bb.0: 1377; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm0 1378; AVX512-NEXT: retq 1379; 1380; AVX512VL-LABEL: constant_shift_v4i32: 1381; AVX512VL: # %bb.0: 1382; AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1383; AVX512VL-NEXT: retq 1384; 1385; X86-SSE-LABEL: constant_shift_v4i32: 1386; X86-SSE: # %bb.0: 1387; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1388; X86-SSE-NEXT: psrad $7, %xmm1 1389; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1390; X86-SSE-NEXT: psrad $6, %xmm2 1391; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1392; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1393; X86-SSE-NEXT: psrad $5, %xmm1 1394; X86-SSE-NEXT: psrad $4, %xmm0 1395; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1396; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 1397; X86-SSE-NEXT: retl 1398 %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1399 ret <4 x i32> %shift 1400} 1401 1402define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { 1403; SSE2-LABEL: constant_shift_v8i16: 1404; SSE2: # %bb.0: 1405; SSE2-NEXT: movdqa %xmm0, %xmm1 1406; SSE2-NEXT: psraw $4, %xmm1 1407; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1408; SSE2-NEXT: movapd %xmm1, %xmm2 1409; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] 1410; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] 1411; SSE2-NEXT: psraw $2, %xmm1 1412; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1413; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] 1414; SSE2-NEXT: movaps %xmm2, %xmm1 1415; SSE2-NEXT: andps %xmm0, %xmm1 1416; SSE2-NEXT: psraw $1, %xmm2 1417; SSE2-NEXT: andnps %xmm2, %xmm0 1418; SSE2-NEXT: orps %xmm1, %xmm0 1419; SSE2-NEXT: retq 1420; 1421; SSE41-LABEL: constant_shift_v8i16: 1422; SSE41: # %bb.0: 1423; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,4096,2048,1024,512] 1424; SSE41-NEXT: pmulhw %xmm0, %xmm1 1425; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1426; SSE41-NEXT: psraw $1, %xmm0 1427; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] 1428; SSE41-NEXT: retq 1429; 1430; AVX-LABEL: constant_shift_v8i16: 1431; AVX: # %bb.0: 1432; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,16384,8192,4096,2048,1024,512] 1433; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1434; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 1435; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] 1436; AVX-NEXT: retq 1437; 1438; XOP-LABEL: constant_shift_v8i16: 1439; XOP: # %bb.0: 1440; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1441; XOP-NEXT: retq 1442; 1443; AVX512DQ-LABEL: constant_shift_v8i16: 1444; AVX512DQ: # %bb.0: 1445; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 1446; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1447; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 1448; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1449; AVX512DQ-NEXT: vzeroupper 1450; AVX512DQ-NEXT: retq 1451; 1452; AVX512BW-LABEL: constant_shift_v8i16: 1453; AVX512BW: # %bb.0: 1454; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1455; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1456; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1457; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1458; AVX512BW-NEXT: vzeroupper 1459; AVX512BW-NEXT: retq 1460; 1461; AVX512DQVL-LABEL: constant_shift_v8i16: 1462; AVX512DQVL: # %bb.0: 1463; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 1464; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1465; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 1466; AVX512DQVL-NEXT: vzeroupper 1467; AVX512DQVL-NEXT: retq 1468; 1469; AVX512BWVL-LABEL: constant_shift_v8i16: 1470; AVX512BWVL: # %bb.0: 1471; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1472; AVX512BWVL-NEXT: retq 1473; 1474; X86-SSE-LABEL: constant_shift_v8i16: 1475; X86-SSE: # %bb.0: 1476; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1477; X86-SSE-NEXT: psraw $4, %xmm1 1478; X86-SSE-NEXT: movsd {{.*#+}} xmm1 
= xmm0[0],xmm1[1] 1479; X86-SSE-NEXT: movapd %xmm1, %xmm2 1480; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] 1481; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] 1482; X86-SSE-NEXT: psraw $2, %xmm1 1483; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1484; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] 1485; X86-SSE-NEXT: movaps %xmm2, %xmm1 1486; X86-SSE-NEXT: andps %xmm0, %xmm1 1487; X86-SSE-NEXT: psraw $1, %xmm2 1488; X86-SSE-NEXT: andnps %xmm2, %xmm0 1489; X86-SSE-NEXT: orps %xmm1, %xmm0 1490; X86-SSE-NEXT: retl 1491 %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1492 ret <8 x i16> %shift 1493} 1494 1495define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind { 1496; SSE2-LABEL: constant_shift_v8i16_pairs: 1497; SSE2: # %bb.0: 1498; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [512,512,u,u,8192,8192,1024,1024] 1499; SSE2-NEXT: pmulhw %xmm0, %xmm1 1500; SSE2-NEXT: psraw $1, %xmm0 1501; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1502; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1503; SSE2-NEXT: retq 1504; 1505; SSE41-LABEL: constant_shift_v8i16_pairs: 1506; SSE41: # %bb.0: 1507; SSE41-NEXT: movdqa %xmm0, %xmm1 1508; SSE41-NEXT: psraw $1, %xmm1 1509; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,512,u,u,8192,8192,1024,1024] 1510; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1511; SSE41-NEXT: retq 1512; 1513; AVX1-LABEL: constant_shift_v8i16_pairs: 1514; AVX1: # %bb.0: 1515; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 1516; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,512,u,u,8192,8192,1024,1024] 1517; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1518; AVX1-NEXT: retq 1519; 1520; AVX2-LABEL: constant_shift_v8i16_pairs: 1521; AVX2: # %bb.0: 1522; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1523; AVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1524; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [256,256,16384,16384,4096,4096,512,512] 1525; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1526; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1527; AVX2-NEXT: retq 1528; 1529; XOP-LABEL: constant_shift_v8i16_pairs: 1530; XOP: # %bb.0: 1531; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1532; XOP-NEXT: retq 1533; 1534; AVX512DQ-LABEL: constant_shift_v8i16_pairs: 1535; AVX512DQ: # %bb.0: 1536; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1537; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1538; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [256,256,16384,16384,4096,4096,512,512] 1539; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 1540; AVX512DQ-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1541; AVX512DQ-NEXT: retq 1542; 1543; AVX512BW-LABEL: constant_shift_v8i16_pairs: 1544; AVX512BW: # %bb.0: 1545; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1546; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,7,1,1,3,3,6,6] 1547; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1548; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1549; AVX512BW-NEXT: vzeroupper 1550; AVX512BW-NEXT: retq 1551; 1552; AVX512DQVL-LABEL: constant_shift_v8i16_pairs: 1553; AVX512DQVL: # %bb.0: 1554; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1555; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [256,256,16384,16384,4096,4096,512,512] 1556; AVX512DQVL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) 1557; AVX512DQVL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1558; AVX512DQVL-NEXT: retq 1559; 1560; AVX512BWVL-LABEL: constant_shift_v8i16_pairs: 1561; AVX512BWVL: # %bb.0: 1562; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1563; AVX512BWVL-NEXT: retq 1564; 1565; X86-SSE-LABEL: constant_shift_v8i16_pairs: 1566; X86-SSE: # %bb.0: 1567; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [512,512,u,u,8192,8192,1024,1024] 1568; X86-SSE-NEXT: pmulhw %xmm0, %xmm1 1569; 
X86-SSE-NEXT: psraw $1, %xmm0 1570; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1571; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1572; X86-SSE-NEXT: retl 1573 %shift = ashr <8 x i16> %a, <i16 7, i16 7, i16 1, i16 1, i16 3, i16 3, i16 6, i16 6> 1574 ret <8 x i16> %shift 1575} 1576 1577define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { 1578; SSE-LABEL: constant_shift_v16i8: 1579; SSE: # %bb.0: 1580; SSE-NEXT: movdqa %xmm0, %xmm1 1581; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1582; SSE-NEXT: psraw $8, %xmm1 1583; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,4,8,16,32,64,128,256] 1584; SSE-NEXT: psrlw $8, %xmm1 1585; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1586; SSE-NEXT: psraw $8, %xmm0 1587; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] 1588; SSE-NEXT: psrlw $8, %xmm0 1589; SSE-NEXT: packuswb %xmm1, %xmm0 1590; SSE-NEXT: retq 1591; 1592; AVX1-LABEL: constant_shift_v16i8: 1593; AVX1: # %bb.0: 1594; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1595; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 1596; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] 1597; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1598; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1599; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 1600; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] 1601; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1602; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1603; AVX1-NEXT: retq 1604; 1605; AVX2-LABEL: constant_shift_v16i8: 1606; AVX2: # %bb.0: 1607; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1608; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,2,4,8,16,32,64,128,256] 1609; 
AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1610; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1611; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1612; AVX2-NEXT: vzeroupper 1613; AVX2-NEXT: retq 1614; 1615; XOP-LABEL: constant_shift_v16i8: 1616; XOP: # %bb.0: 1617; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1618; XOP-NEXT: retq 1619; 1620; AVX512DQ-LABEL: constant_shift_v16i8: 1621; AVX512DQ: # %bb.0: 1622; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 1623; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1624; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1625; AVX512DQ-NEXT: vzeroupper 1626; AVX512DQ-NEXT: retq 1627; 1628; AVX512BW-LABEL: constant_shift_v16i8: 1629; AVX512BW: # %bb.0: 1630; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1631; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1632; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1633; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1634; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1635; AVX512BW-NEXT: vzeroupper 1636; AVX512BW-NEXT: retq 1637; 1638; AVX512DQVL-LABEL: constant_shift_v16i8: 1639; AVX512DQVL: # %bb.0: 1640; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1641; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1642; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1643; AVX512DQVL-NEXT: vzeroupper 1644; AVX512DQVL-NEXT: retq 1645; 1646; AVX512BWVL-LABEL: constant_shift_v16i8: 1647; AVX512BWVL: # %bb.0: 1648; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1649; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1650; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1651; AVX512BWVL-NEXT: vzeroupper 1652; AVX512BWVL-NEXT: retq 1653; 1654; X86-SSE-LABEL: constant_shift_v16i8: 1655; X86-SSE: # %bb.0: 1656; X86-SSE-NEXT: movdqa %xmm0, %xmm1 1657; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1658; X86-SSE-NEXT: psraw $8, %xmm1 1659; 
X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,4,8,16,32,64,128,256] 1660; X86-SSE-NEXT: psrlw $8, %xmm1 1661; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1662; X86-SSE-NEXT: psraw $8, %xmm0 1663; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] 1664; X86-SSE-NEXT: psrlw $8, %xmm0 1665; X86-SSE-NEXT: packuswb %xmm1, %xmm0 1666; X86-SSE-NEXT: retl 1667 %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> 1668 ret <16 x i8> %shift 1669} 1670 1671define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { 1672; SSE2-LABEL: constant_shift_v16i8_pairs: 1673; SSE2: # %bb.0: 1674; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] 1675; SSE2-NEXT: pandn %xmm0, %xmm1 1676; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1677; SSE2-NEXT: por %xmm1, %xmm0 1678; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1679; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1680; SSE2-NEXT: pxor %xmm1, %xmm0 1681; SSE2-NEXT: psubb %xmm1, %xmm0 1682; SSE2-NEXT: retq 1683; 1684; SSE41-LABEL: constant_shift_v16i8_pairs: 1685; SSE41: # %bb.0: 1686; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32768,4096,512,8192,16384,u,2048,1024] 1687; SSE41-NEXT: pmulhuw %xmm0, %xmm1 1688; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] 1689; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1690; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1691; SSE41-NEXT: pxor %xmm1, %xmm0 1692; SSE41-NEXT: psubb %xmm1, %xmm0 1693; SSE41-NEXT: retq 1694; 1695; AVX-LABEL: constant_shift_v16i8_pairs: 1696; AVX: # %bb.0: 1697; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32768,4096,512,8192,16384,u,2048,1024] 1698; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] 1699; AVX-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1700; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1701; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 1702; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1703; AVX-NEXT: retq 1704; 1705; XOP-LABEL: constant_shift_v16i8_pairs: 1706; XOP: # %bb.0: 1707; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1708; XOP-NEXT: retq 1709; 1710; AVX512DQ-LABEL: constant_shift_v16i8_pairs: 1711; AVX512DQ: # %bb.0: 1712; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32768,4096,512,8192,16384,u,2048,1024] 1713; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] 1714; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1715; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1716; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 1717; AVX512DQ-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1718; AVX512DQ-NEXT: retq 1719; 1720; AVX512BW-LABEL: constant_shift_v16i8_pairs: 1721; AVX512BW: # %bb.0: 1722; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1723; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,4,7,3,2,0,5,6] 1724; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1725; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1726; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1727; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 1728; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1729; AVX512BW-NEXT: vzeroupper 1730; AVX512BW-NEXT: retq 1731; 1732; AVX512DQVL-LABEL: constant_shift_v16i8_pairs: 1733; AVX512DQVL: # %bb.0: 1734; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32768,4096,512,8192,16384,u,2048,1024] 1735; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] 1736; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1737; AVX512DQVL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) 1738; AVX512DQVL-NEXT: 
vpsubb %xmm1, %xmm0, %xmm0 1739; AVX512DQVL-NEXT: retq 1740; 1741; AVX512BWVL-LABEL: constant_shift_v16i8_pairs: 1742; AVX512BWVL: # %bb.0: 1743; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1744; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1745; AVX512BWVL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) 1746; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1747; AVX512BWVL-NEXT: retq 1748; 1749; X86-SSE-LABEL: constant_shift_v16i8_pairs: 1750; X86-SSE: # %bb.0: 1751; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] 1752; X86-SSE-NEXT: pandn %xmm0, %xmm1 1753; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1754; X86-SSE-NEXT: por %xmm1, %xmm0 1755; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1756; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] 1757; X86-SSE-NEXT: pxor %xmm1, %xmm0 1758; X86-SSE-NEXT: psubb %xmm1, %xmm0 1759; X86-SSE-NEXT: retl 1760 %shift = ashr <16 x i8> %a, <i8 1, i8 1, i8 4, i8 4, i8 7, i8 7, i8 3, i8 3, i8 2, i8 2, i8 0, i8 0, i8 5, i8 5, i8 6, i8 6> 1761 ret <16 x i8> %shift 1762} 1763 1764define <16 x i8> @constant_shift_v16i8_quads(<16 x i8> %a) nounwind { 1765; SSE2-LABEL: constant_shift_v16i8_quads: 1766; SSE2: # %bb.0: 1767; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,u,32768,32768,8192,8192,16384,16384] 1768; SSE2-NEXT: pmulhuw %xmm0, %xmm1 1769; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1770; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1771; SSE2-NEXT: movaps {{.*#+}} xmm0 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1772; SSE2-NEXT: xorps %xmm0, %xmm1 1773; SSE2-NEXT: psubb %xmm0, %xmm1 1774; SSE2-NEXT: movdqa %xmm1, %xmm0 1775; SSE2-NEXT: retq 1776; 1777; SSE41-LABEL: constant_shift_v16i8_quads: 1778; SSE41: # %bb.0: 1779; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,32768,32768,8192,8192,16384,16384] 1780; SSE41-NEXT: pmulhuw %xmm0, %xmm1 1781; SSE41-NEXT: pblendw 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1782; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1783; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1784; SSE41-NEXT: pxor %xmm1, %xmm0 1785; SSE41-NEXT: psubb %xmm1, %xmm0 1786; SSE41-NEXT: retq 1787; 1788; AVX1-LABEL: constant_shift_v16i8_quads: 1789; AVX1: # %bb.0: 1790; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,32768,32768,8192,8192,16384,16384] 1791; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1792; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1793; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1794; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1795; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1796; AVX1-NEXT: retq 1797; 1798; AVX2-LABEL: constant_shift_v16i8_quads: 1799; AVX2: # %bb.0: 1800; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1801; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1802; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1803; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1804; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1805; AVX2-NEXT: retq 1806; 1807; XOP-LABEL: constant_shift_v16i8_quads: 1808; XOP: # %bb.0: 1809; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1810; XOP-NEXT: retq 1811; 1812; AVX512-LABEL: constant_shift_v16i8_quads: 1813; AVX512: # %bb.0: 1814; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1815; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1816; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1817; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 1818; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1819; AVX512-NEXT: retq 1820; 1821; AVX512VL-LABEL: constant_shift_v16i8_quads: 1822; AVX512VL: # %bb.0: 1823; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1824; 
AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1825; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) 1826; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1827; AVX512VL-NEXT: retq 1828; 1829; X86-SSE-LABEL: constant_shift_v16i8_quads: 1830; X86-SSE: # %bb.0: 1831; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [u,u,32768,32768,8192,8192,16384,16384] 1832; X86-SSE-NEXT: pmulhuw %xmm0, %xmm1 1833; X86-SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1834; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1835; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32] 1836; X86-SSE-NEXT: xorps %xmm0, %xmm1 1837; X86-SSE-NEXT: psubb %xmm0, %xmm1 1838; X86-SSE-NEXT: movdqa %xmm1, %xmm0 1839; X86-SSE-NEXT: retl 1840 %shift = ashr <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2> 1841 ret <16 x i8> %shift 1842} 1843 1844; 1845; Uniform Constant Shifts 1846; 1847 1848define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { 1849; SSE2-LABEL: splatconstant_shift_v2i64: 1850; SSE2: # %bb.0: 1851; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 1852; SSE2-NEXT: psrad $7, %xmm1 1853; SSE2-NEXT: psrlq $7, %xmm0 1854; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1855; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1856; SSE2-NEXT: retq 1857; 1858; SSE41-LABEL: splatconstant_shift_v2i64: 1859; SSE41: # %bb.0: 1860; SSE41-NEXT: movdqa %xmm0, %xmm1 1861; SSE41-NEXT: psrad $7, %xmm1 1862; SSE41-NEXT: psrlq $7, %xmm0 1863; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1864; SSE41-NEXT: retq 1865; 1866; AVX1-LABEL: splatconstant_shift_v2i64: 1867; AVX1: # %bb.0: 1868; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1869; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 1870; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1871; AVX1-NEXT: retq 1872; 1873; AVX2-LABEL: 
splatconstant_shift_v2i64: 1874; AVX2: # %bb.0: 1875; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1 1876; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0 1877; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1878; AVX2-NEXT: retq 1879; 1880; XOP-LABEL: splatconstant_shift_v2i64: 1881; XOP: # %bb.0: 1882; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1883; XOP-NEXT: retq 1884; 1885; AVX512-LABEL: splatconstant_shift_v2i64: 1886; AVX512: # %bb.0: 1887; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1888; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0 1889; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1890; AVX512-NEXT: vzeroupper 1891; AVX512-NEXT: retq 1892; 1893; AVX512VL-LABEL: splatconstant_shift_v2i64: 1894; AVX512VL: # %bb.0: 1895; AVX512VL-NEXT: vpsraq $7, %xmm0, %xmm0 1896; AVX512VL-NEXT: retq 1897; 1898; X86-SSE-LABEL: splatconstant_shift_v2i64: 1899; X86-SSE: # %bb.0: 1900; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 1901; X86-SSE-NEXT: psrad $7, %xmm1 1902; X86-SSE-NEXT: psrlq $7, %xmm0 1903; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1904; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1905; X86-SSE-NEXT: retl 1906 %shift = ashr <2 x i64> %a, <i64 7, i64 7> 1907 ret <2 x i64> %shift 1908} 1909 1910define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { 1911; SSE-LABEL: splatconstant_shift_v4i32: 1912; SSE: # %bb.0: 1913; SSE-NEXT: psrad $5, %xmm0 1914; SSE-NEXT: retq 1915; 1916; AVX-LABEL: splatconstant_shift_v4i32: 1917; AVX: # %bb.0: 1918; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 1919; AVX-NEXT: retq 1920; 1921; XOP-LABEL: splatconstant_shift_v4i32: 1922; XOP: # %bb.0: 1923; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 1924; XOP-NEXT: retq 1925; 1926; AVX512-LABEL: splatconstant_shift_v4i32: 1927; AVX512: # %bb.0: 1928; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 1929; AVX512-NEXT: retq 1930; 1931; AVX512VL-LABEL: splatconstant_shift_v4i32: 1932; AVX512VL: # %bb.0: 1933; AVX512VL-NEXT: vpsrad $5, %xmm0, 
%xmm0 1934; AVX512VL-NEXT: retq 1935; 1936; X86-SSE-LABEL: splatconstant_shift_v4i32: 1937; X86-SSE: # %bb.0: 1938; X86-SSE-NEXT: psrad $5, %xmm0 1939; X86-SSE-NEXT: retl 1940 %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> 1941 ret <4 x i32> %shift 1942} 1943 1944define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { 1945; SSE-LABEL: splatconstant_shift_v8i16: 1946; SSE: # %bb.0: 1947; SSE-NEXT: psraw $3, %xmm0 1948; SSE-NEXT: retq 1949; 1950; AVX-LABEL: splatconstant_shift_v8i16: 1951; AVX: # %bb.0: 1952; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 1953; AVX-NEXT: retq 1954; 1955; XOP-LABEL: splatconstant_shift_v8i16: 1956; XOP: # %bb.0: 1957; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 1958; XOP-NEXT: retq 1959; 1960; AVX512-LABEL: splatconstant_shift_v8i16: 1961; AVX512: # %bb.0: 1962; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 1963; AVX512-NEXT: retq 1964; 1965; AVX512VL-LABEL: splatconstant_shift_v8i16: 1966; AVX512VL: # %bb.0: 1967; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 1968; AVX512VL-NEXT: retq 1969; 1970; X86-SSE-LABEL: splatconstant_shift_v8i16: 1971; X86-SSE: # %bb.0: 1972; X86-SSE-NEXT: psraw $3, %xmm0 1973; X86-SSE-NEXT: retl 1974 %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> 1975 ret <8 x i16> %shift 1976} 1977 1978define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { 1979; SSE-LABEL: splatconstant_shift_v16i8: 1980; SSE: # %bb.0: 1981; SSE-NEXT: psrlw $3, %xmm0 1982; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1983; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1984; SSE-NEXT: pxor %xmm1, %xmm0 1985; SSE-NEXT: psubb %xmm1, %xmm0 1986; SSE-NEXT: retq 1987; 1988; AVX1-LABEL: splatconstant_shift_v16i8: 1989; AVX1: # %bb.0: 1990; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 1991; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1992; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1993; AVX1-NEXT: vpxor %xmm1, 
%xmm0, %xmm0 1994; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1995; AVX1-NEXT: retq 1996; 1997; AVX2-LABEL: splatconstant_shift_v16i8: 1998; AVX2: # %bb.0: 1999; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 2000; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2001; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2002; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2003; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2004; AVX2-NEXT: retq 2005; 2006; XOP-LABEL: splatconstant_shift_v16i8: 2007; XOP: # %bb.0: 2008; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2009; XOP-NEXT: retq 2010; 2011; AVX512-LABEL: splatconstant_shift_v16i8: 2012; AVX512: # %bb.0: 2013; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 2014; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2015; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2016; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 2017; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2018; AVX512-NEXT: retq 2019; 2020; AVX512DQVL-LABEL: splatconstant_shift_v16i8: 2021; AVX512DQVL: # %bb.0: 2022; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 2023; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2024; AVX512DQVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) 2025; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2026; AVX512DQVL-NEXT: retq 2027; 2028; AVX512BWVL-LABEL: splatconstant_shift_v16i8: 2029; AVX512BWVL: # %bb.0: 2030; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 2031; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2032; AVX512BWVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) 2033; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2034; AVX512BWVL-NEXT: retq 2035; 2036; X86-SSE-LABEL: splatconstant_shift_v16i8: 2037; X86-SSE: # %bb.0: 2038; X86-SSE-NEXT: psrlw $3, %xmm0 2039; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 2040; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 
= [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2041; X86-SSE-NEXT: pxor %xmm1, %xmm0 2042; X86-SSE-NEXT: psubb %xmm1, %xmm0 2043; X86-SSE-NEXT: retl 2044 %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> 2045 ret <16 x i8> %shift 2046} 2047 2048define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) { 2049; SSE-LABEL: PR52719: 2050; SSE: # %bb.0: 2051; SSE-NEXT: movd %edi, %xmm1 2052; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 2053; SSE-NEXT: psrlq %xmm1, %xmm2 2054; SSE-NEXT: psrlq %xmm1, %xmm0 2055; SSE-NEXT: pxor %xmm2, %xmm0 2056; SSE-NEXT: psubq %xmm2, %xmm0 2057; SSE-NEXT: retq 2058; 2059; AVX1-LABEL: PR52719: 2060; AVX1: # %bb.0: 2061; AVX1-NEXT: vmovd %edi, %xmm1 2062; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 2063; AVX1-NEXT: # xmm2 = mem[0,0] 2064; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 2065; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 2066; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 2067; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 2068; AVX1-NEXT: retq 2069; 2070; AVX2-LABEL: PR52719: 2071; AVX2: # %bb.0: 2072; AVX2-NEXT: vmovd %edi, %xmm1 2073; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 2074; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 2075; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 2076; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 2077; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 2078; AVX2-NEXT: retq 2079; 2080; XOPAVX1-LABEL: PR52719: 2081; XOPAVX1: # %bb.0: 2082; XOPAVX1-NEXT: vmovd %edi, %xmm1 2083; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 2084; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2085; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 2086; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 2087; XOPAVX1-NEXT: retq 2088; 2089; XOPAVX2-LABEL: PR52719: 2090; XOPAVX2: # %bb.0: 2091; XOPAVX2-NEXT: vmovd %edi, %xmm1 2092; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 2093; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2094; 
XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 2095; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 2096; XOPAVX2-NEXT: retq 2097; 2098; AVX512-LABEL: PR52719: 2099; AVX512: # %bb.0: 2100; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2101; AVX512-NEXT: vmovd %edi, %xmm1 2102; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 2103; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2104; AVX512-NEXT: vzeroupper 2105; AVX512-NEXT: retq 2106; 2107; AVX512VL-LABEL: PR52719: 2108; AVX512VL: # %bb.0: 2109; AVX512VL-NEXT: vmovd %edi, %xmm1 2110; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 2111; AVX512VL-NEXT: retq 2112; 2113; X86-SSE-LABEL: PR52719: 2114; X86-SSE: # %bb.0: 2115; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2116; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 2117; X86-SSE-NEXT: psrlq %xmm1, %xmm2 2118; X86-SSE-NEXT: psrlq %xmm1, %xmm0 2119; X86-SSE-NEXT: pxor %xmm2, %xmm0 2120; X86-SSE-NEXT: psubq %xmm2, %xmm0 2121; X86-SSE-NEXT: retl 2122 %vec = insertelement <2 x i32> poison, i32 %a1, i64 0 2123 %splat = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer 2124 %zext = zext <2 x i32> %splat to <2 x i64> 2125 %ashr = ashr <2 x i64> %a0, %zext 2126 ret <2 x i64> %ashr 2127} 2128