; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL: # %bb.0:
; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL: # %bb.0:
; ALL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <16 x i32> %a, %b
  ret <16 x i32> %shift
}

define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = shl <32 x i16> %a, %b
  ret <32 x i16> %shift
}

define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
  %shift = shl <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL: # %bb.0:
; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = shl <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = shl <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = shl <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = shl <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL: # %bb.0:
; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = shl <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL: # %bb.0:
; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = shl <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = shl <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = shl <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL: # %bb.0:
; ALL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL: # %bb.0:
; ALL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <32 x i16> @constant_shift_v32i16_pairs(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16_pairs:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16_pairs:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = shl <32 x i16> %a, <i16 0, i16 0, i16 1, i16 1, i16 3, i16 3, i16 2, i16 2, i16 6, i16 6, i16 7, i16 7, i16 5, i16 5, i16 4, i16 4, i16 12, i16 12, i16 13, i16 13, i16 15, i16 15, i16 14, i16 14, i16 10, i16 10, i16 11, i16 11, i16 9, i16 9, i16 8, i16 8>
  ret <32 x i16> %shift
}

define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
; AVX512BW-NEXT: retq
  %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [8,128,64,4,128,1,128,2,32,1,16,128,64,2,16,1]
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,8,16,2,4,64,16,2,2,32,32,64,4,64,16,16]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8_pairs:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 7, i8 7, i8 6, i8 6, i8 2, i8 2, i8 7, i8 7, i8 0, i8 0, i8 7, i8 7, i8 1, i8 1, i8 5, i8 5, i8 0, i8 0, i8 4, i8 4, i8 7, i8 7, i8 6, i8 6, i8 1, i8 1, i8 4, i8 4, i8 0, i8 0, i8 6, i8 6, i8 3, i8 3, i8 4, i8 4, i8 1, i8 1, i8 2, i8 2, i8 6, i8 6, i8 4, i8 4, i8 1, i8 1, i8 1, i8 1, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 2, i8 2, i8 6, i8 6, i8 4, i8 4, i8 4, i8 4>
  ret <64 x i8> %shift
}

define <64 x i8> @constant_shift_v64i8_quads(<64 x i8> %a) nounwind {
; ALL-LABEL: constant_shift_v64i8_quads:
; ALL: # %bb.0:
; ALL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <64 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 5, i8 5, i8 5, i8 5, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 6, i8 6, i8 6, i8 6, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL: # %bb.0:
; ALL-NEXT: vpsllq $7, %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL: # %bb.0:
; ALL-NEXT: vpslld $5, %zmm0, %zmm0
; ALL-NEXT: retq
  %shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}