; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2

;
; Variable Shifts
;

; Logical right shift of <4 x i64> %a where each lane uses the shift amount
; held in the matching lane of %b.
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

; Logical right shift of <8 x i32> %a where each lane uses the shift amount
; held in the matching lane of %b.
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshld %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; X86-AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; X86-AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

; Logical right shift of <16 x i16> %a where each lane uses the shift amount
; held in the matching lane of %b.
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; X86-AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; X86-AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

; Logical right shift of <32 x i8> %a where each lane uses the shift amount
; held in the matching lane of %b.
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; X86-AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; X86-AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

; Logical right shift of <4 x i64> %a by one runtime amount: element 0 of %b
; is splatted to every lane (zeroinitializer shuffle mask) before the shift.
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

; Logical right shift of <8 x i32> %a by one runtime amount: element 0 of %b
; is splatted to every lane (zeroinitializer shuffle mask) before the shift.
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

; Logical right shift of <16 x i16> %a by one runtime amount: element 0 of %b
; is splatted to every lane (zeroinitializer shuffle mask) before the shift.
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer
  %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT:    vpsrlw
%xmm1, %xmm2, %xmm1 777; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 778; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 779; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 780; AVX512DQVL-NEXT: retq 781; 782; AVX512BWVL-LABEL: splatvar_shift_v32i8: 783; AVX512BWVL: # %bb.0: 784; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 785; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 786; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 787; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 788; AVX512BWVL-NEXT: retq 789; 790; X86-AVX1-LABEL: splatvar_shift_v32i8: 791; X86-AVX1: # %bb.0: 792; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 793; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 794; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 795; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 796; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 797; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 798; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 799; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 800; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 801; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 802; X86-AVX1-NEXT: retl 803; 804; X86-AVX2-LABEL: splatvar_shift_v32i8: 805; X86-AVX2: # %bb.0: 806; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 807; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 808; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, 
%xmm2 809; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 810; X86-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 811; X86-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 812; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 813; X86-AVX2-NEXT: retl 814 %splat = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer 815 %shift = lshr <32 x i8> %a, %splat 816 ret <32 x i8> %shift 817} 818 819; 820; Uniform Variable Modulo Shifts 821; 822 823define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 824; AVX1-LABEL: splatvar_modulo_shift_v4i64: 825; AVX1: # %bb.0: 826; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 827; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 828; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 829; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 830; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 831; AVX1-NEXT: retq 832; 833; AVX2-LABEL: splatvar_modulo_shift_v4i64: 834; AVX2: # %bb.0: 835; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 836; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 837; AVX2-NEXT: retq 838; 839; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64: 840; XOPAVX1: # %bb.0: 841; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 842; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 843; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 844; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 845; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 846; XOPAVX1-NEXT: retq 847; 848; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64: 849; XOPAVX2: # %bb.0: 850; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 851; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 852; XOPAVX2-NEXT: retq 853; 854; AVX512-LABEL: splatvar_modulo_shift_v4i64: 855; AVX512: # %bb.0: 856; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 857; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 858; AVX512-NEXT: retq 859; 860; AVX512VL-LABEL: splatvar_modulo_shift_v4i64: 861; AVX512VL: # %bb.0: 862; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, 
%xmm1 863; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 864; AVX512VL-NEXT: retq 865; 866; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64: 867; X86-AVX1: # %bb.0: 868; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 869; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 870; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 871; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 872; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 873; X86-AVX1-NEXT: retl 874; 875; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64: 876; X86-AVX2: # %bb.0: 877; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 878; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 879; X86-AVX2-NEXT: retl 880 %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63> 881 %splat = shufflevector <4 x i64> %mod, <4 x i64> poison, <4 x i32> zeroinitializer 882 %shift = lshr <4 x i64> %a, %splat 883 ret <4 x i64> %shift 884} 885 886define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 887; AVX1-LABEL: splatvar_modulo_shift_v8i32: 888; AVX1: # %bb.0: 889; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 890; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 891; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 892; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 893; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 894; AVX1-NEXT: retq 895; 896; AVX2-LABEL: splatvar_modulo_shift_v8i32: 897; AVX2: # %bb.0: 898; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 899; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 900; AVX2-NEXT: retq 901; 902; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32: 903; XOPAVX1: # %bb.0: 904; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 905; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 906; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 907; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 908; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 909; XOPAVX1-NEXT: retq 910; 911; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32: 912; XOPAVX2: # %bb.0: 913; XOPAVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 914; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 915; XOPAVX2-NEXT: retq 916; 917; AVX512-LABEL: splatvar_modulo_shift_v8i32: 918; AVX512: # %bb.0: 919; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 920; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 921; AVX512-NEXT: retq 922; 923; AVX512VL-LABEL: splatvar_modulo_shift_v8i32: 924; AVX512VL: # %bb.0: 925; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 926; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 927; AVX512VL-NEXT: retq 928; 929; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32: 930; X86-AVX1: # %bb.0: 931; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 932; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 933; X86-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 934; X86-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 935; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 936; X86-AVX1-NEXT: retl 937; 938; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32: 939; X86-AVX2: # %bb.0: 940; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 941; X86-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 942; X86-AVX2-NEXT: retl 943 %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 944 %splat = shufflevector <8 x i32> %mod, <8 x i32> poison, <8 x i32> zeroinitializer 945 %shift = lshr <8 x i32> %a, %splat 946 ret <8 x i32> %shift 947} 948 949define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 950; AVX1-LABEL: splatvar_modulo_shift_v16i16: 951; AVX1: # %bb.0: 952; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 953; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 954; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 955; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 956; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 957; AVX1-NEXT: retq 958; 959; AVX2-LABEL: splatvar_modulo_shift_v16i16: 960; AVX2: # %bb.0: 961; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 962; AVX2-NEXT: vpsrlw %xmm1, %ymm0, 
%ymm0 963; AVX2-NEXT: retq 964; 965; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16: 966; XOPAVX1: # %bb.0: 967; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 968; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 969; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 970; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 971; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 972; XOPAVX1-NEXT: retq 973; 974; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16: 975; XOPAVX2: # %bb.0: 976; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 977; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 978; XOPAVX2-NEXT: retq 979; 980; AVX512-LABEL: splatvar_modulo_shift_v16i16: 981; AVX512: # %bb.0: 982; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 983; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 984; AVX512-NEXT: retq 985; 986; AVX512VL-LABEL: splatvar_modulo_shift_v16i16: 987; AVX512VL: # %bb.0: 988; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 989; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 990; AVX512VL-NEXT: retq 991; 992; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16: 993; X86-AVX1: # %bb.0: 994; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 995; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 996; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 997; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 998; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 999; X86-AVX1-NEXT: retl 1000; 1001; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16: 1002; X86-AVX2: # %bb.0: 1003; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 1004; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1005; X86-AVX2-NEXT: retl 1006 %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 1007 %splat = shufflevector <16 x i16> %mod, <16 x i16> poison, <16 x i32> zeroinitializer 1008 %shift = lshr <16 x i16> %a, %splat 1009 ret <16 x i16> %shift 1010} 1011 1012define <32 x i8> 
@splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 1013; AVX1-LABEL: splatvar_modulo_shift_v32i8: 1014; AVX1: # %bb.0: 1015; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1016; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1017; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1018; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 1019; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 1020; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1021; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1022; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1023; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1024; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1025; AVX1-NEXT: retq 1026; 1027; AVX2-LABEL: splatvar_modulo_shift_v32i8: 1028; AVX2: # %bb.0: 1029; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1030; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1031; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1032; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1033; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 1034; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 1035; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1036; AVX2-NEXT: retq 1037; 1038; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: 1039; XOPAVX1: # %bb.0: 1040; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1041; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1042; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1043; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1044; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1045; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 1046; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 1047; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1048; XOPAVX1-NEXT: retq 1049; 1050; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8: 1051; XOPAVX2: # %bb.0: 1052; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1053; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1054; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1055; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1056; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1057; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2 
1058; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 1059; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1060; XOPAVX2-NEXT: retq 1061; 1062; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8: 1063; AVX512DQ: # %bb.0: 1064; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1065; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1066; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1067; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1068; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 1069; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 1070; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 1071; AVX512DQ-NEXT: retq 1072; 1073; AVX512BW-LABEL: splatvar_modulo_shift_v32i8: 1074; AVX512BW: # %bb.0: 1075; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1076; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1077; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1078; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 1079; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1080; AVX512BW-NEXT: retq 1081; 1082; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8: 1083; AVX512DQVL: # %bb.0: 1084; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1085; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1086; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1087; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1088; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 1089; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 1090; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1091; AVX512DQVL-NEXT: retq 1092; 1093; AVX512BWVL-LABEL: 
splatvar_modulo_shift_v32i8: 1094; AVX512BWVL: # %bb.0: 1095; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1096; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 1097; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1098; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 1099; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1100; AVX512BWVL-NEXT: retq 1101; 1102; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8: 1103; X86-AVX1: # %bb.0: 1104; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1105; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 1106; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 1107; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 1108; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 1109; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1110; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1111; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1112; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1113; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1114; X86-AVX1-NEXT: retl 1115; 1116; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8: 1117; X86-AVX2: # %bb.0: 1118; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 1119; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1120; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1121; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 1122; X86-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 1123; X86-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 1124; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1125; X86-AVX2-NEXT: retl 1126 
%mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> 1127 %splat = shufflevector <32 x i8> %mod, <32 x i8> poison, <32 x i32> zeroinitializer 1128 %shift = lshr <32 x i8> %a, %splat 1129 ret <32 x i8> %shift 1130} 1131 1132; 1133; Constant Shifts 1134; 1135 1136define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { 1137; AVX1-LABEL: constant_shift_v4i64: 1138; AVX1: # %bb.0: 1139; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1140; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 1141; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 1142; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1143; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 1144; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1145; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1146; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1147; AVX1-NEXT: retq 1148; 1149; AVX2-LABEL: constant_shift_v4i64: 1150; AVX2: # %bb.0: 1151; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1152; AVX2-NEXT: retq 1153; 1154; XOPAVX1-LABEL: constant_shift_v4i64: 1155; XOPAVX1: # %bb.0: 1156; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1157; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1158; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1159; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1160; XOPAVX1-NEXT: retq 1161; 1162; XOPAVX2-LABEL: constant_shift_v4i64: 1163; XOPAVX2: # %bb.0: 1164; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1165; XOPAVX2-NEXT: retq 1166; 1167; AVX512-LABEL: constant_shift_v4i64: 1168; AVX512: # %bb.0: 1169; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1170; AVX512-NEXT: retq 1171; 1172; AVX512VL-LABEL: constant_shift_v4i64: 1173; AVX512VL: # %bb.0: 1174; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1175; AVX512VL-NEXT: retq 
1176; 1177; X86-AVX1-LABEL: constant_shift_v4i64: 1178; X86-AVX1: # %bb.0: 1179; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1180; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 1181; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 1182; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1183; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 1184; X86-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1185; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1186; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1187; X86-AVX1-NEXT: retl 1188; 1189; X86-AVX2-LABEL: constant_shift_v4i64: 1190; X86-AVX2: # %bb.0: 1191; X86-AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1192; X86-AVX2-NEXT: retl 1193 %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> 1194 ret <4 x i64> %shift 1195} 1196 1197define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { 1198; AVX1-LABEL: constant_shift_v8i32: 1199; AVX1: # %bb.0: 1200; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 1201; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 1202; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1203; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 1204; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3 1205; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1206; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1207; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1208; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2 1209; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3 1210; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1211; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 1212; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1213; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1214; AVX1-NEXT: retq 1215; 1216; AVX2-LABEL: constant_shift_v8i32: 1217; AVX2: # %bb.0: 1218; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1219; AVX2-NEXT: retq 1220; 1221; XOPAVX1-LABEL: constant_shift_v8i32: 1222; XOPAVX1: # %bb.0: 1223; XOPAVX1-NEXT: vpshld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1224; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1225; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1226; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1227; XOPAVX1-NEXT: retq 1228; 1229; XOPAVX2-LABEL: constant_shift_v8i32: 1230; XOPAVX2: # %bb.0: 1231; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1232; XOPAVX2-NEXT: retq 1233; 1234; AVX512-LABEL: constant_shift_v8i32: 1235; AVX512: # %bb.0: 1236; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1237; AVX512-NEXT: retq 1238; 1239; AVX512VL-LABEL: constant_shift_v8i32: 1240; AVX512VL: # %bb.0: 1241; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1242; AVX512VL-NEXT: retq 1243; 1244; X86-AVX1-LABEL: constant_shift_v8i32: 1245; X86-AVX1: # %bb.0: 1246; X86-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 1247; X86-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 1248; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1249; X86-AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 1250; X86-AVX1-NEXT: vpsrld $4, %xmm0, %xmm3 1251; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1252; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1253; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1254; X86-AVX1-NEXT: vpsrld $7, %xmm0, %xmm2 1255; X86-AVX1-NEXT: vpsrld $9, %xmm0, %xmm3 1256; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1257; X86-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 1258; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1259; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1260; X86-AVX1-NEXT: retl 1261; 1262; X86-AVX2-LABEL: constant_shift_v8i32: 1263; X86-AVX2: # %bb.0: 1264; X86-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1265; X86-AVX2-NEXT: retl 1266 %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> 1267 ret <8 x i32> %shift 1268} 1269 1270define <16 x i16> 
@constant_shift_v16i16_pairs(<16 x i16> %a) nounwind { 1271; AVX1-LABEL: constant_shift_v16i16_pairs: 1272; AVX1: # %bb.0: 1273; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,32768,32768,16384,16384,8192,8192] 1274; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1275; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1276; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4096,4096,2048,2048,1024,1024,512,512] 1277; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1278; AVX1-NEXT: retq 1279; 1280; AVX2-LABEL: constant_shift_v16i16_pairs: 1281; AVX2: # %bb.0: 1282; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1283; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1284; AVX2-NEXT: retq 1285; 1286; XOPAVX1-LABEL: constant_shift_v16i16_pairs: 1287; XOPAVX1: # %bb.0: 1288; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1289; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1290; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1291; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1292; XOPAVX1-NEXT: retq 1293; 1294; XOPAVX2-LABEL: constant_shift_v16i16_pairs: 1295; XOPAVX2: # %bb.0: 1296; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,u,32768,32768,16384,16384,8192,8192,4096,4096,2048,2048,1024,1024,512,512] 1297; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] 1298; XOPAVX2-NEXT: retq 1299; 1300; AVX512DQ-LABEL: constant_shift_v16i16_pairs: 1301; AVX512DQ: # %bb.0: 1302; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1303; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1304; AVX512DQ-NEXT: retq 1305; 1306; AVX512BW-LABEL: constant_shift_v16i16_pairs: 1307; AVX512BW: # %bb.0: 1308; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1309; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1310; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1311; 
AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1312; AVX512BW-NEXT: retq 1313; 1314; AVX512DQVL-LABEL: constant_shift_v16i16_pairs: 1315; AVX512DQVL: # %bb.0: 1316; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1317; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1318; AVX512DQVL-NEXT: retq 1319; 1320; AVX512BWVL-LABEL: constant_shift_v16i16_pairs: 1321; AVX512BWVL: # %bb.0: 1322; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1323; AVX512BWVL-NEXT: retq 1324; 1325; X86-AVX1-LABEL: constant_shift_v16i16_pairs: 1326; X86-AVX1: # %bb.0: 1327; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,u,32768,32768,16384,16384,8192,8192] 1328; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1329; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1330; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [4096,4096,2048,2048,1024,1024,512,512] 1331; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1332; X86-AVX1-NEXT: retl 1333; 1334; X86-AVX2-LABEL: constant_shift_v16i16_pairs: 1335; X86-AVX2: # %bb.0: 1336; X86-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1337; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1338; X86-AVX2-NEXT: retl 1339 %shift = lshr <16 x i16> %a, <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3, i16 4, i16 4, i16 5, i16 5, i16 6, i16 6, i16 7, i16 7> 1340 ret <16 x i16> %shift 1341} 1342 1343define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { 1344; AVX1-LABEL: constant_shift_v16i16: 1345; AVX1: # %bb.0: 1346; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512] 1347; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1348; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1349; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] 1350; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1351; 
AVX1-NEXT: retq 1352; 1353; AVX2-LABEL: constant_shift_v16i16: 1354; AVX2: # %bb.0: 1355; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 1356; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1357; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1358; AVX2-NEXT: retq 1359; 1360; XOPAVX1-LABEL: constant_shift_v16i16: 1361; XOPAVX1: # %bb.0: 1362; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1363; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1364; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1365; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1366; XOPAVX1-NEXT: retq 1367; 1368; XOPAVX2-LABEL: constant_shift_v16i16: 1369; XOPAVX2: # %bb.0: 1370; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 1371; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1372; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1373; XOPAVX2-NEXT: retq 1374; 1375; AVX512DQ-LABEL: constant_shift_v16i16: 1376; AVX512DQ: # %bb.0: 1377; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 1378; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1379; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1380; AVX512DQ-NEXT: retq 1381; 1382; AVX512BW-LABEL: constant_shift_v16i16: 1383; AVX512BW: # %bb.0: 1384; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1385; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1386; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1387; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1388; AVX512BW-NEXT: retq 1389; 1390; AVX512DQVL-LABEL: constant_shift_v16i16: 1391; AVX512DQVL: # %bb.0: 1392; AVX512DQVL-NEXT: vpmulhuw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 1393; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1394; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1395; AVX512DQVL-NEXT: retq 1396; 1397; AVX512BWVL-LABEL: constant_shift_v16i16: 1398; AVX512BWVL: # %bb.0: 1399; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1400; AVX512BWVL-NEXT: retq 1401; 1402; X86-AVX1-LABEL: constant_shift_v16i16: 1403; X86-AVX1: # %bb.0: 1404; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512] 1405; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1406; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1407; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] 1408; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1409; X86-AVX1-NEXT: retl 1410; 1411; X86-AVX2-LABEL: constant_shift_v16i16: 1412; X86-AVX2: # %bb.0: 1413; X86-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 1414; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1415; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1416; X86-AVX2-NEXT: retl 1417 %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1418 ret <16 x i16> %shift 1419} 1420 1421define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind { 1422; AVX1-LABEL: constant_shift_v32i8_pairs: 1423; AVX1: # %bb.0: 1424; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1425; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096] 1426; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1 1427; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15] 1428; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1429; 
AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0 1430; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1431; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1432; AVX1-NEXT: retq 1433; 1434; AVX2-LABEL: constant_shift_v32i8_pairs: 1435; AVX2: # %bb.0: 1436; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] 1437; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1438; AVX2-NEXT: retq 1439; 1440; XOPAVX1-LABEL: constant_shift_v32i8_pairs: 1441; XOPAVX1: # %bb.0: 1442; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1443; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252] 1444; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1445; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1446; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1447; XOPAVX1-NEXT: retq 1448; 1449; XOPAVX2-LABEL: constant_shift_v32i8_pairs: 1450; XOPAVX2: # %bb.0: 1451; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1452; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252] 1453; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1454; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1455; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1456; XOPAVX2-NEXT: retq 1457; 1458; AVX512DQ-LABEL: constant_shift_v32i8_pairs: 1459; AVX512DQ: # %bb.0: 1460; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] 1461; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1462; AVX512DQ-NEXT: retq 1463; 1464; AVX512BW-LABEL: constant_shift_v32i8_pairs: 1465; AVX512BW: # %bb.0: 1466; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1467; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,2,4,6,1,2,3,4,7,2,4,6,1,2,3,4] 1468; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] 1469; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1470; AVX512BW-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1471; AVX512BW-NEXT: retq 1472; 1473; AVX512DQVL-LABEL: constant_shift_v32i8_pairs: 1474; AVX512DQVL: # %bb.0: 1475; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] 1476; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1477; AVX512DQVL-NEXT: retq 1478; 1479; AVX512BWVL-LABEL: constant_shift_v32i8_pairs: 1480; AVX512BWVL: # %bb.0: 1481; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1482; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1483; AVX512BWVL-NEXT: retq 1484; 1485; X86-AVX1-LABEL: constant_shift_v32i8_pairs: 1486; X86-AVX1: # %bb.0: 1487; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1488; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096] 1489; X86-AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1 1490; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15] 1491; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1492; X86-AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0 1493; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1494; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1495; X86-AVX1-NEXT: retl 1496; 1497; X86-AVX2-LABEL: constant_shift_v32i8_pairs: 1498; X86-AVX2: # %bb.0: 1499; X86-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] 1500; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1501; X86-AVX2-NEXT: retl 1502 %shift = lshr <32 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4> 1503 ret <32 x i8> %shift 1504} 1505 1506define <32 x i8> @constant_shift_v32i8_quads(<32 x i8> %a) nounwind { 1507; AVX1-LABEL: constant_shift_v32i8_quads: 1508; 
AVX1: # %bb.0: 1509; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [8192,8192,16384,16384,32768,32768,u,u] 1510; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] 1511; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1512; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1513; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,512,1024,1024,2048,2048,4096,4096] 1514; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1515; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1516; AVX1-NEXT: retq 1517; 1518; AVX2-LABEL: constant_shift_v32i8_quads: 1519; AVX2: # %bb.0: 1520; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1521; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1522; AVX2-NEXT: retq 1523; 1524; XOPAVX1-LABEL: constant_shift_v32i8_quads: 1525; XOPAVX1: # %bb.0: 1526; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1527; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1528; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1529; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1530; XOPAVX1-NEXT: retq 1531; 1532; XOPAVX2-LABEL: constant_shift_v32i8_quads: 1533; XOPAVX2: # %bb.0: 1534; XOPAVX2-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1535; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1536; XOPAVX2-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1537; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1538; XOPAVX2-NEXT: retq 1539; 1540; AVX512-LABEL: constant_shift_v32i8_quads: 1541; AVX512: # %bb.0: 1542; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1543; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1544; AVX512-NEXT: retq 1545; 1546; AVX512VL-LABEL: constant_shift_v32i8_quads: 1547; AVX512VL: # %bb.0: 1548; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1549; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1550; 
AVX512VL-NEXT: retq 1551; 1552; X86-AVX1-LABEL: constant_shift_v32i8_quads: 1553; X86-AVX1: # %bb.0: 1554; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [8192,8192,16384,16384,32768,32768,u,u] 1555; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] 1556; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 1557; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1558; X86-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [512,512,1024,1024,2048,2048,4096,4096] 1559; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 1560; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1561; X86-AVX1-NEXT: retl 1562; 1563; X86-AVX2-LABEL: constant_shift_v32i8_quads: 1564; X86-AVX2: # %bb.0: 1565; X86-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1566; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1567; X86-AVX2-NEXT: retl 1568 %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 7, i8 7, i8 7, i8 7, i8 6, i8 6, i8 6, i8 6, i8 5, i8 5, i8 5, i8 5, i8 4, i8 4, i8 4, i8 4> 1569 ret <32 x i8> %shift 1570} 1571 1572define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { 1573; AVX1-LABEL: constant_shift_v32i8: 1574; AVX1: # %bb.0: 1575; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1576; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1577; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1578; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256] 1579; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 1580; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 1581; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1582; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2] 1583; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 1584; AVX1-NEXT: 
vpsrlw $8, %xmm1, %xmm1 1585; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 1586; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1587; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 1588; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1589; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1590; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 1591; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1592; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1593; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1594; AVX1-NEXT: retq 1595; 1596; AVX2-LABEL: constant_shift_v32i8: 1597; AVX2: # %bb.0: 1598; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1599; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1600; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] 1601; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 1602; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1603; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] 1604; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1605; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 1606; AVX2-NEXT: retq 1607; 1608; XOPAVX1-LABEL: constant_shift_v32i8: 1609; XOPAVX1: # %bb.0: 1610; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1611; XOPAVX1-NEXT: vmovdqa 
{{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0] 1612; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1613; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1614; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1615; XOPAVX1-NEXT: retq 1616; 1617; XOPAVX2-LABEL: constant_shift_v32i8: 1618; XOPAVX2: # %bb.0: 1619; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1620; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0] 1621; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1622; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1623; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1624; XOPAVX2-NEXT: retq 1625; 1626; AVX512DQ-LABEL: constant_shift_v32i8: 1627; AVX512DQ: # %bb.0: 1628; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 1629; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1630; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] 1631; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 1632; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1633; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] 1634; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 1635; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 1636; AVX512DQ-NEXT: retq 1637; 1638; AVX512BW-LABEL: constant_shift_v32i8: 1639; AVX512BW: # %bb.0: 1640; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1641; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1642; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1643; AVX512BW-NEXT: retq 1644; 1645; AVX512DQVL-LABEL: constant_shift_v32i8: 1646; AVX512DQVL: # %bb.0: 1647; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1648; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1649; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] 1650; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 1651; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1652; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] 1653; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 1654; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 1655; AVX512DQVL-NEXT: retq 1656; 1657; AVX512BWVL-LABEL: constant_shift_v32i8: 1658; AVX512BWVL: # %bb.0: 1659; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1660; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1661; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1662; AVX512BWVL-NEXT: retq 1663; 1664; X86-AVX1-LABEL: constant_shift_v32i8: 1665; X86-AVX1: # %bb.0: 1666; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1667; X86-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1668; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1669; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256] 1670; X86-AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 1671; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 1672; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1673; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2] 1674; X86-AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 1675; X86-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1676; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 1677; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1678; X86-AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 1679; X86-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1680; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1681; X86-AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 1682; X86-AVX1-NEXT: vpsrlw $8, %xmm0, 
%xmm0 1683; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1684; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1685; X86-AVX1-NEXT: retl 1686; 1687; X86-AVX2-LABEL: constant_shift_v32i8: 1688; X86-AVX2: # %bb.0: 1689; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1690; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1691; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] 1692; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 1693; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1694; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] 1695; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1696; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 1697; X86-AVX2-NEXT: retl 1698 %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> 1699 ret <32 x i8> %shift 1700} 1701 1702; 1703; Uniform Constant Shifts 1704; 1705 1706define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { 1707; AVX1-LABEL: splatconstant_shift_v4i64: 1708; AVX1: # %bb.0: 1709; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1710; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1711; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 1712; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1713; AVX1-NEXT: retq 1714; 1715; AVX2-LABEL: 
splatconstant_shift_v4i64: 1716; AVX2: # %bb.0: 1717; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 1718; AVX2-NEXT: retq 1719; 1720; XOPAVX1-LABEL: splatconstant_shift_v4i64: 1721; XOPAVX1: # %bb.0: 1722; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1723; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1724; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 1725; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1726; XOPAVX1-NEXT: retq 1727; 1728; XOPAVX2-LABEL: splatconstant_shift_v4i64: 1729; XOPAVX2: # %bb.0: 1730; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 1731; XOPAVX2-NEXT: retq 1732; 1733; AVX512-LABEL: splatconstant_shift_v4i64: 1734; AVX512: # %bb.0: 1735; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 1736; AVX512-NEXT: retq 1737; 1738; AVX512VL-LABEL: splatconstant_shift_v4i64: 1739; AVX512VL: # %bb.0: 1740; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 1741; AVX512VL-NEXT: retq 1742; 1743; X86-AVX1-LABEL: splatconstant_shift_v4i64: 1744; X86-AVX1: # %bb.0: 1745; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1746; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1747; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 1748; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1749; X86-AVX1-NEXT: retl 1750; 1751; X86-AVX2-LABEL: splatconstant_shift_v4i64: 1752; X86-AVX2: # %bb.0: 1753; X86-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 1754; X86-AVX2-NEXT: retl 1755 %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> 1756 ret <4 x i64> %shift 1757} 1758 1759define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { 1760; AVX1-LABEL: splatconstant_shift_v8i32: 1761; AVX1: # %bb.0: 1762; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 1763; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1764; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0 1765; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1766; AVX1-NEXT: retq 1767; 1768; AVX2-LABEL: splatconstant_shift_v8i32: 1769; AVX2: # %bb.0: 1770; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0 1771; AVX2-NEXT: retq 1772; 1773; XOPAVX1-LABEL: splatconstant_shift_v8i32: 1774; XOPAVX1: # %bb.0: 1775; XOPAVX1-NEXT: vpsrld $5, %xmm0, 
%xmm1 1776; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1777; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm0 1778; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1779; XOPAVX1-NEXT: retq 1780; 1781; XOPAVX2-LABEL: splatconstant_shift_v8i32: 1782; XOPAVX2: # %bb.0: 1783; XOPAVX2-NEXT: vpsrld $5, %ymm0, %ymm0 1784; XOPAVX2-NEXT: retq 1785; 1786; AVX512-LABEL: splatconstant_shift_v8i32: 1787; AVX512: # %bb.0: 1788; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0 1789; AVX512-NEXT: retq 1790; 1791; AVX512VL-LABEL: splatconstant_shift_v8i32: 1792; AVX512VL: # %bb.0: 1793; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0 1794; AVX512VL-NEXT: retq 1795; 1796; X86-AVX1-LABEL: splatconstant_shift_v8i32: 1797; X86-AVX1: # %bb.0: 1798; X86-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 1799; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1800; X86-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0 1801; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1802; X86-AVX1-NEXT: retl 1803; 1804; X86-AVX2-LABEL: splatconstant_shift_v8i32: 1805; X86-AVX2: # %bb.0: 1806; X86-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0 1807; X86-AVX2-NEXT: retl 1808 %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> 1809 ret <8 x i32> %shift 1810} 1811 1812define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { 1813; AVX1-LABEL: splatconstant_shift_v16i16: 1814; AVX1: # %bb.0: 1815; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 1816; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1817; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 1818; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1819; AVX1-NEXT: retq 1820; 1821; AVX2-LABEL: splatconstant_shift_v16i16: 1822; AVX2: # %bb.0: 1823; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 1824; AVX2-NEXT: retq 1825; 1826; XOPAVX1-LABEL: splatconstant_shift_v16i16: 1827; XOPAVX1: # %bb.0: 1828; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 1829; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1830; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 1831; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1832; XOPAVX1-NEXT: retq 1833; 1834; 
XOPAVX2-LABEL: splatconstant_shift_v16i16: 1835; XOPAVX2: # %bb.0: 1836; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 1837; XOPAVX2-NEXT: retq 1838; 1839; AVX512-LABEL: splatconstant_shift_v16i16: 1840; AVX512: # %bb.0: 1841; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 1842; AVX512-NEXT: retq 1843; 1844; AVX512VL-LABEL: splatconstant_shift_v16i16: 1845; AVX512VL: # %bb.0: 1846; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 1847; AVX512VL-NEXT: retq 1848; 1849; X86-AVX1-LABEL: splatconstant_shift_v16i16: 1850; X86-AVX1: # %bb.0: 1851; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 1852; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1853; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 1854; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1855; X86-AVX1-NEXT: retl 1856; 1857; X86-AVX2-LABEL: splatconstant_shift_v16i16: 1858; X86-AVX2: # %bb.0: 1859; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 1860; X86-AVX2-NEXT: retl 1861 %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> 1862 ret <16 x i16> %shift 1863} 1864 1865define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { 1866; AVX1-LABEL: splatconstant_shift_v32i8: 1867; AVX1: # %bb.0: 1868; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1869; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 1870; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] 1871; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1872; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 1873; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1874; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1875; AVX1-NEXT: retq 1876; 1877; AVX2-LABEL: splatconstant_shift_v32i8: 1878; AVX2: # %bb.0: 1879; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 1880; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1881; AVX2-NEXT: retq 1882; 1883; XOPAVX1-LABEL: splatconstant_shift_v32i8: 1884; XOPAVX1: # %bb.0: 1885; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1886; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] 1887; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1888; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1889; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1890; XOPAVX1-NEXT: retq 1891; 1892; XOPAVX2-LABEL: splatconstant_shift_v32i8: 1893; XOPAVX2: # %bb.0: 1894; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 1895; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1896; XOPAVX2-NEXT: retq 1897; 1898; AVX512-LABEL: splatconstant_shift_v32i8: 1899; AVX512: # %bb.0: 1900; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 1901; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1902; AVX512-NEXT: retq 1903; 1904; AVX512VL-LABEL: splatconstant_shift_v32i8: 1905; AVX512VL: # %bb.0: 1906; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 1907; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 1908; AVX512VL-NEXT: retq 1909; 1910; X86-AVX1-LABEL: splatconstant_shift_v32i8: 1911; X86-AVX1: # %bb.0: 1912; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1913; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 1914; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] 1915; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1916; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 1917; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1918; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1919; X86-AVX1-NEXT: retl 1920; 1921; X86-AVX2-LABEL: splatconstant_shift_v32i8: 1922; X86-AVX2: # %bb.0: 1923; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 1924; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 1925; X86-AVX2-NEXT: retl 1926 %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> 1927 ret <32 x i8> %shift 1928} 1929 1930; 1931; Special Cases 1932; 1933 1934define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind { 1935; AVX1-LABEL: 
shift32_v4i64: 1936; AVX1: # %bb.0: 1937; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1938; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] 1939; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 1940; AVX1-NEXT: retq 1941; 1942; AVX2-LABEL: shift32_v4i64: 1943; AVX2: # %bb.0: 1944; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 1945; AVX2-NEXT: retq 1946; 1947; XOPAVX1-LABEL: shift32_v4i64: 1948; XOPAVX1: # %bb.0: 1949; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1950; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] 1951; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 1952; XOPAVX1-NEXT: retq 1953; 1954; XOPAVX2-LABEL: shift32_v4i64: 1955; XOPAVX2: # %bb.0: 1956; XOPAVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 1957; XOPAVX2-NEXT: retq 1958; 1959; AVX512-LABEL: shift32_v4i64: 1960; AVX512: # %bb.0: 1961; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0 1962; AVX512-NEXT: retq 1963; 1964; AVX512VL-LABEL: shift32_v4i64: 1965; AVX512VL: # %bb.0: 1966; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 1967; AVX512VL-NEXT: retq 1968; 1969; X86-AVX1-LABEL: shift32_v4i64: 1970; X86-AVX1: # %bb.0: 1971; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1972; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] 1973; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] 1974; X86-AVX1-NEXT: retl 1975; 1976; X86-AVX2-LABEL: shift32_v4i64: 1977; X86-AVX2: # %bb.0: 1978; X86-AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 1979; X86-AVX2-NEXT: retl 1980 %shift = lshr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32> 1981 ret <4 x i64> %shift 1982} 1983 1984define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) { 1985; AVX1-LABEL: sh_trunc_sh_vec: 1986; AVX1: # %bb.0: 1987; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1988; AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1 1989; AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0 1990; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1991; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1992; AVX1-NEXT: 
vzeroupper 1993; AVX1-NEXT: retq 1994; 1995; AVX2-LABEL: sh_trunc_sh_vec: 1996; AVX2: # %bb.0: 1997; AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 1998; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1999; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2000; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] 2001; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 2002; AVX2-NEXT: vzeroupper 2003; AVX2-NEXT: retq 2004; 2005; XOPAVX1-LABEL: sh_trunc_sh_vec: 2006; XOPAVX1: # %bb.0: 2007; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2008; XOPAVX1-NEXT: vpsrlq $36, %xmm1, %xmm1 2009; XOPAVX1-NEXT: vpsrlq $36, %xmm0, %xmm0 2010; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2011; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2012; XOPAVX1-NEXT: vzeroupper 2013; XOPAVX1-NEXT: retq 2014; 2015; XOPAVX2-LABEL: sh_trunc_sh_vec: 2016; XOPAVX2: # %bb.0: 2017; XOPAVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 2018; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2019; XOPAVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2020; XOPAVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] 2021; XOPAVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 2022; XOPAVX2-NEXT: vzeroupper 2023; XOPAVX2-NEXT: retq 2024; 2025; AVX512-LABEL: sh_trunc_sh_vec: 2026; AVX512: # %bb.0: 2027; AVX512-NEXT: vpsrlq $36, %ymm0, %ymm0 2028; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2029; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] 2030; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 2031; AVX512-NEXT: vzeroupper 2032; AVX512-NEXT: retq 2033; 2034; AVX512VL-LABEL: sh_trunc_sh_vec: 2035; AVX512VL: # %bb.0: 2036; AVX512VL-NEXT: vpsrlq $36, %ymm0, %ymm0 2037; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 2038; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 2039; AVX512VL-NEXT: vzeroupper 2040; AVX512VL-NEXT: retq 2041; 2042; X86-AVX1-LABEL: sh_trunc_sh_vec: 2043; X86-AVX1: # %bb.0: 2044; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2045; X86-AVX1-NEXT: 
vpsrlq $36, %xmm1, %xmm1 2046; X86-AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0 2047; X86-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2048; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 2049; X86-AVX1-NEXT: vzeroupper 2050; X86-AVX1-NEXT: retl 2051; 2052; X86-AVX2-LABEL: sh_trunc_sh_vec: 2053; X86-AVX2: # %bb.0: 2054; X86-AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 2055; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2056; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2057; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] 2058; X86-AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 2059; X86-AVX2-NEXT: vzeroupper 2060; X86-AVX2-NEXT: retl 2061 %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24> 2062 %t = trunc <4 x i64> %s to <4 x i32> 2063 %r = lshr <4 x i32> %t, <i32 12, i32 12, i32 12, i32 12> 2064 ret <4 x i32> %r 2065} 2066