; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, %b
  ret <16 x i32> %shift
}

define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, %b
  ret <32 x i16> %shift
}

define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $2, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $1, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <32 x i16> @constant_shift_v32i16_pairs(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 9, i16 9, i16 9, i16 9, i16 10, i16 10, i16 10, i16 10, i16 11, i16 11, i16 11, i16 11, i16 12, i16 12, i16 12, i16 12, i16 13, i16 13, i16 13, i16 13, i16 14, i16 14, i16 14, i16 14, i16 15, i16 15, i16 15, i16 15>
  ret <32 x i16> %shift
}

define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15]
; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
  ret <64 x i8> %shift
}

define <64 x i8> @constant_shift_v64i8_quads(<64 x i8> %a) nounwind {
; ALL-LABEL: constant_shift_v64i8_quads:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}

define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrld $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}