; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <16 x i32> %a, %b
  ret <16 x i32> %shift
}

define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm3
; AVX512DQ-NEXT:    vpsravd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, %b
  ret <32 x i16> %shift
}

define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $4, %ymm5, %ymm6
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpsraw $2, %ymm5, %ymm6
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpsraw $1, %ymm5, %ymm6
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $4, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $2, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $1, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm2
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $4, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $2, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $1, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $4, %ymm0, %ymm4
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsraw $2, %ymm0, %ymm4
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsraw $1, %ymm0, %ymm4
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsraw $4, %zmm2, %zmm3
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsraw $2, %zmm2, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsraw $1, %zmm2, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm3
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = ashr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = ashr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vpsraw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = ashr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpxor %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm1 & zmm0)
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = ashr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = ashr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = ashr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsraw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = ashr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpxor %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm1 & zmm0)
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = ashr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT:    vpsravd %zmm2, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpsravd %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <32 x i16> @constant_shift_v32i16_pairs(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [128,128,128,128,64,64,64,64,32,32,32,32,16,16,16,16,8,8,8,8,4,4,4,4,2,2,2,2,1,1,1,1]
; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 9, i16 9, i16 9, i16 9, i16 10, i16 10, i16 10, i16 10, i16 11, i16 11, i16 11, i16 11, i16 12, i16 12, i16 12, i16 12, i16 13, i16 13, i16 13, i16 13, i16 14, i16 14, i16 14, i16 14, i16 15, i16 15, i16 15, i16 15>
  ret <32 x i16> %shift
}

define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsraw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsraw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1024,1024,16384,16384,1024,4096,4096,2048,1024,32768,8192,16384,4096,512,2048,u]
; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2,32,32,32,32,2,2,8,8,8,8,4,4,2,2,64,64,16,16,32,32,8,8,1,1,4,4,128,128]
; AVX512DQ-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [512,32768,u,512,4096,u,32768,8192,32768,4096,4096,8192,1024,1024,2048,1024]
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,64,64,128,128,1,1,8,8,128,128,64,64,16,16,64,64,8,8,8,8,16,16,2,2,2,2,4,4,2,2]
; AVX512DQ-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,32,32,32,32,2,2,8,8,8,8,4,4,2,2,64,64,16,16,32,32,8,8,1,1,4,4,128,128,1,1,64,64,128,128,1,1,8,8,128,128,64,64,16,16,64,64,8,8,8,8,16,16,2,2,2,2,4,4,2,2]
; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, <i8 6, i8 6, i8 6, i8 6, i8 2, i8 2, i8 2, i8 2, i8 6, i8 6, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 1, i8 1, i8 3, i8 3, i8 2, i8 2, i8 4, i8 4, i8 7, i8 7, i8 5, i8 5, i8 0, i8 0, i8 7, i8 7, i8 1, i8 1, i8 0, i8 0, i8 7, i8 7, i8 4, i8 4, i8 0, i8 0, i8 1, i8 1, i8 3, i8 3, i8 1, i8 1, i8 4, i8 4, i8 4, i8 4, i8 3, i8 3, i8 6, i8 6, i8 6, i8 6, i8 5, i8 5, i8 6, i8 6>
  ret <64 x i8> %shift
}

define <64 x i8> @constant_shift_v64i8_quads(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_quads:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,4,4,4,4,32,32,32,32,1,1,1,1,1,1,1,1,4,4,4,4,1,1,1,1,4,4,4,4,8,8,8,8,16,16,16,16,16,16,16,16,2,2,2,2,64,64,64,64,4,4,4,4,32,32,32,32,128,128,128,128]
; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512DQ-NEXT:    vpsubb %ymm1, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8_quads:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,4,4,4,4,32,32,32,32,1,1,1,1,1,1,1,1,4,4,4,4,1,1,1,1,4,4,4,4,8,8,8,8,16,16,16,16,16,16,16,16,2,2,2,2,64,64,64,64,4,4,4,4,32,32,32,32,128,128,128,128]
; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 5, i8 5, i8 5, i8 5, i8 2, i8 2, i8 2, i8 2, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 5, i8 5, i8 5, i8 5, i8 4, i8 4, i8 4, i8 4, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 6, i8 6, i8 6, i8 6, i8 1, i8 1, i8 1, i8 1, i8 5, i8 5, i8 5, i8 5, i8 2, i8 2, i8 2, i8 2, i8 0, i8 0, i8 0, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsraq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrad $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsraw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpsrlw $3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQ-NEXT:    vpxor %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}

define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) {
; AVX512DQ-LABEL: ashr_const7_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: ashr_const7_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-NEXT:    retq
  %res = ashr <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <64 x i8> %res
}

define <8 x i64> @PR52719(<8 x i64> %a0, i32 %a1) {
; ALL-LABEL: PR52719:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovd %edi, %xmm1
; ALL-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %vec = insertelement <8 x i32> poison, i32 %a1, i64 0
  %splat = shufflevector <8 x i32> %vec, <8 x i32> poison, <8 x i32> zeroinitializer
  %zext = zext <8 x i32> %splat to <8 x i64>
  %ashr = ashr <8 x i64> %a0, %zext
  ret <8 x i64> %ashr
}