; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2

;
; Variable Shifts
;

define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: # xmm3 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, %b
  ret <4 x i64> %shift
}
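
; Note (not autogenerated): AVX1/AVX2 have no 64-bit arithmetic right shift,
; so the lowering checked above leans on the identity
;   ashr(x, s) == (lshr(x, s) ^ m) - m, with m = lshr(0x8000000000000000, s),
; i.e. shift logically, then xor/subtract the shifted sign bit to sign-extend.
; That is why the checks shift the splatted constant 9223372036854775808
; (0x8000000000000000) and finish with vpxor + vpsubq.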

define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshad %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
; X86-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; X86-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; X86-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; X86-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, %b
  ret <8 x i32> %shift
}
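
; Note (not autogenerated): variable per-element 32-bit shifts only exist from
; AVX2 onwards (vpsravd). The AVX1 checks above show the scalarized fallback:
; each 128-bit half isolates its four shift amounts into the low bits of a
; register (vpsrldq / vpsrlq $32 / vpunpckhdq / vpmovzxdq), performs four
; vpsrad ops, and recombines the results with vpblendw.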

define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshaw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshaw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
; X86-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; X86-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, %b
  ret <16 x i16> %shift
}
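
; Note (not autogenerated): there is no variable vpsravw before AVX512BW. The
; AVX1 sequence moves the 4-bit shift amount into the byte MSBs (vpsllw $12 /
; vpsllw $4 / vpor) and uses vpblendvb to conditionally apply shifts of 8, 4,
; 2 and 1, doubling the mask with vpaddw between steps. AVX2 instead
; interleaves values and amounts with zero into 32-bit lanes, shifts with
; vpsravd, and repacks via vpsrld $16 + vpackusdw.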

define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshab %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshab %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: var_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; X86-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
; X86-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; X86-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
; X86-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X86-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X86-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, %b
  ret <32 x i8> %shift
}
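
; Note (not autogenerated): for bytes the 3-bit shift amount is moved into the
; MSB with vpsllw $5, and vpblendvb selects, per byte, between the current
; value and a copy shifted by 4, 2 and then 1 (performed on 16-bit halves via
; vpunpck{h,l}bw + vpsraw, repacked with vpsrlw $8 + vpackuswb). XOP targets
; use the native per-element vpshab instead, and AVX512BW extends to words and
; shifts with vpsravw on zmm.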

;
; Uniform Variable Shifts
;

define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer
  %shift = ashr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer
  %shift = ashr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}
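
; Note (not autogenerated): with a splatted shift amount the backend can use
; the legacy shift-by-xmm forms (vpsraq/vpsrad/vpsraw), which read the count
; from the low 64 bits, so only the bottom element needs zero-extending
; (vpmovzxdq / vpmovzxwq). The v4i64 case above and the v32i8 case below still
; need the xor/sub sign-fix because no quadword or byte arithmetic shift
; exists on AVX1/AVX2.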

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer
  %shift = ashr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & ymm0)
; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %splat = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer
  %shift = ashr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;
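; Note (not autogenerated): the shift amount is masked with (bit width - 1)
; before being splatted, so the AND shows up as a single vpand/vpandq on the
; xmm count while the shift lowering otherwise should match the splatvar
; cases above.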

define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <4 x i64> %mod, <4 x i64> poison, <4 x i32> zeroinitializer
  %shift = ashr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <8 x i32> %mod, <8 x i32> poison, <8 x i32> zeroinitializer
  %shift = ashr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_modulo_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <16 x i16> %mod, <16 x i16> poison, <16 x i32> zeroinitializer
  %shift = ashr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_modulo_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & ymm0)
; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <32 x i8> %mod, <32 x i8> poison, <32 x i32> zeroinitializer
  %shift = ashr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;
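; NOTE: x86 has no native vector ashr of i64 (before AVX512) or of i8 at all,
; so most of the lowerings below rely on the identity
; ashr(x, c) == (lshr(x, c) ^ m) - m, where m = lshr(signbit, c); the
; xor/sub pairs against shifted sign-mask constants implement that sign fixup.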
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,31,62]
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,0]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1073741824,0,16777216,0,1,2,0]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}

define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}

define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,16384,8192,4096,2048,1024,512]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,u,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: retq
;
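; NOTE: AVX512BW adds a per-element i16 arithmetic shift (vpsravw), but without
; AVX512VL it only exists at 512 bits, hence the implicit ymm-to-zmm widening
; (the 'kill' comments) here, while the BWVL run uses the 256-bit form directly.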
; AVX512BW-LABEL: constant_shift_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,u,16384,8192,4096,2048,1024,512]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [u,u,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}

define <16 x i16> @constant_shift_v16i16_pairs(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16_pairs:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,u,u,8192,8192,16384,16384]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1024,1024,512,512,2048,2048,4096,4096]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16_pairs:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,16384,16384,4096,4096,8192,8192,512,512,256,256,1024,1024,2048,2048]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16_pairs:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16_pairs:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16_pairs:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,16384,16384,4096,4096,8192,8192,512,512,256,256,1024,1024,2048,2048]
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i16_pairs:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,1,1,3,3,2,2,6,6,7,7,5,5,4,4]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16_pairs:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,16384,16384,4096,4096,8192,8192,512,512,256,256,1024,1024,2048,2048]
; AVX512DQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512DQVL-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16_pairs:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v16i16_pairs:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,u,u,u,8192,8192,16384,16384]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [1024,1024,512,512,2048,2048,4096,4096]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v16i16_pairs:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,16384,16384,4096,4096,8192,8192,512,512,256,256,1024,1024,2048,2048]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, <i16 0, i16 0, i16 1, i16 1, i16 3, i16 3, i16 2, i16 2, i16 6, i16 6, i16 7, i16 7, i16 5, i16 5, i16 4, i16 4>
  ret <16 x i16> %shift
}

define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX2-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
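; NOTE: with AVX512BW the per-byte shifts are done at i16 precision instead:
; sign-extend the bytes into a zmm, shift with vpsravw, then truncate back
; with vpmovwb.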
; AVX512BW-LABEL: constant_shift_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpsraw $8, %ymm1, %ymm1
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; X86-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8_pairs:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [512,16384,4096,1024,8192,2048,4096,32768]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [2048,1024,2048,4096,16384,1024,16384,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8_pairs:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [2048,1024,2048,4096,16384,1024,16384,u,512,16384,4096,1024,8192,2048,4096,32768]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8_pairs:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8_pairs:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8_pairs:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [2048,1024,2048,4096,16384,1024,16384,u,512,16384,4096,1024,8192,2048,4096,32768]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
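; NOTE: in the 'pairs' variant each pair of adjacent bytes shares a count, so
; the vector can be shifted as i16 lanes with a logical shift (vpsrlvw here,
; or vpmulhuw by power-of-two reciprocals on AVX1), after which the per-byte
; sign is restored with the usual xor/sub masks.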
; AVX512BW-LABEL: constant_shift_v32i8_pairs:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,6,5,4,2,6,2,0,7,2,4,6,3,5,4,1]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8_pairs:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [2048,1024,2048,4096,16384,1024,16384,u,512,16384,4096,1024,8192,2048,4096,32768]
; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; AVX512DQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8_pairs:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; AVX512BWVL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v32i8_pairs:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # [512,16384,4096,1024,8192,2048,4096,32768]
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 # [2048,1024,2048,4096,16384,1024,16384,u]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6],xmm0[7]
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v32i8_pairs:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [2048,1024,2048,4096,16384,1024,16384,u,512,16384,4096,1024,8192,2048,4096,32768]
; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 5, i8 5, i8 6, i8 6, i8 5, i8 5, i8 4, i8 4, i8 2, i8 2, i8 6, i8 6, i8 2, i8 2, i8 0, i8 0, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 3, i8 3, i8 5, i8 5, i8 4, i8 4, i8 1, i8 1>
  ret <32 x i8> %shift
}

define <32 x i8> @constant_shift_v32i8_quads(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8_quads:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,1024,512,512,2048,2048,4096,4096]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,1,1,1,1,4,4,4,4,8,8,8,8]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [u,u,32768,32768,8192,8192,16384,16384]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8_quads:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32,2,2,2,2,1,1,1,1,4,4,4,4,8,8,8,8]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8_quads:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8_quads:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v32i8_quads:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32,2,2,2,2,1,1,1,1,4,4,4,4,8,8,8,8]
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v32i8_quads:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32,2,2,2,2,1,1,1,1,4,4,4,4,8,8,8,8]
; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: constant_shift_v32i8_quads:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # [1024,1024,512,512,2048,2048,4096,4096]
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,1,1,1,1,4,4,4,4,8,8,8,8]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 # [u,u,32768,32768,8192,8192,16384,16384]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: constant_shift_v32i8_quads:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32,2,2,2,2,1,1,1,1,4,4,4,4,8,8,8,8]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 5, i8 5, i8 5, i8 5, i8 4, i8 4, i8 4, i8 4>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;
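; NOTE: a uniform (splat) constant amount can use the immediate-form shifts
; (vpsraw/vpsrad $n); i64 still needs the sign-mask emulation before AVX512
; (vpsraq), and i8 is done as a masked i16 shift plus the xor/sub fixup.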
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; X86-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; X86-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v8i32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v8i32:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}

;
; Special Cases
;
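; NOTE: an i64 ashr by exactly 32 needs no 64-bit shift at all: the result is
; just the high 32-bit half of each element (vpshufd) interleaved with its
; sign word (vpsrad $31).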
define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: shift32_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: shift32_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $32, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT: retl
  %shift = ashr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
  ret <4 x i64> %shift
}
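; NOTE: PR52719 - the shift amount is a splatted i32 zero-extended to i64, so
; the low 64 bits of an xmm hold the whole count and the scalar-count
; vpsrlq/vpsraq forms are usable; AVX1/AVX2 still need the sign-mask fixup,
; while AVX512 has vpsraq directly.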
define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) {
; AVX1-LABEL: PR52719:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR52719:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: PR52719:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd %edi, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshaq %xmm3, %xmm4, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: PR52719:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovd %edi, %xmm1
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: PR52719:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vmovd %edi, %xmm1
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: PR52719:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovd %edi, %xmm1
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X86-AVX1-LABEL: PR52719:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: PR52719:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
  %vec = insertelement <4 x i32> poison, i32 %a1, i64 0
  %splat = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  %zext = zext <4 x i32> %splat to <4 x i64>
  %ashr = ashr <4 x i64> %a0, %zext
  ret <4 x i64> %ashr
}