; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2

declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;

define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
  ret <8 x i64> %res
}

define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
  ret <16 x i32> %res
}

define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT: vpsrlvd %ymm5, %ymm7, %ymm5
; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT: vpsrlvd %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
; AVX512F-NEXT: vpackusdw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT: vpsrlvd %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm6
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpsrlvd %ymm5, %ymm7, %ymm5
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
; AVX512VL-NEXT: vpackusdw %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpsrlvd %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
  ret <32 x i16> %res
}

define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $6, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm6
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm6
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm8 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm8 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $6, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm8 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm8 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VBMI2-NEXT: vpermi2b %zmm3, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VLVBMI2-NEXT: vpermi2b %zmm3, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
  ret <64 x i8> %res
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
  ret <8 x i64> %res
}

define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
  ret <16 x i32> %res
}

define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm5
; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm5
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
  ret <32 x i16> %res
}

define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VBMI2-NEXT: vpermi2b %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VLVBMI2-NEXT: vpermi2b %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
  ret 
<64 x i8> %res 466} 467 468; 469; Constant Shifts 470; 471 472define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind { 473; AVX512-LABEL: constant_funnnel_v8i64: 474; AVX512: # %bb.0: 475; AVX512-NEXT: vprorvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 476; AVX512-NEXT: retq 477 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>) 478 ret <8 x i64> %res 479} 480 481define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind { 482; AVX512-LABEL: constant_funnnel_v16i32: 483; AVX512: # %bb.0: 484; AVX512-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 485; AVX512-NEXT: retq 486 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>) 487 ret <16 x i32> %res 488} 489 490define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { 491; AVX512F-LABEL: constant_funnnel_v32i16: 492; AVX512F: # %bb.0: 493; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 494; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 495; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 496; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4 497; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 498; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 499; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 500; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 501; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 502; AVX512F-NEXT: retq 503; 504; AVX512VL-LABEL: constant_funnnel_v32i16: 505; AVX512VL: # %bb.0: 506; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 507; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 508; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 509; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4 510; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 511; 
AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 512; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 513; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 514; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 515; AVX512VL-NEXT: retq 516; 517; AVX512BW-LABEL: constant_funnnel_v32i16: 518; AVX512BW: # %bb.0: 519; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 520; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 521; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 522; AVX512BW-NEXT: retq 523; 524; AVX512VLBW-LABEL: constant_funnnel_v32i16: 525; AVX512VLBW: # %bb.0: 526; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 527; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 528; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 529; AVX512VLBW-NEXT: retq 530; 531; AVX512VBMI2-LABEL: constant_funnnel_v32i16: 532; AVX512VBMI2: # %bb.0: 533; AVX512VBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 534; AVX512VBMI2-NEXT: retq 535; 536; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16: 537; AVX512VLVBMI2: # %bb.0: 538; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 539; AVX512VLVBMI2-NEXT: retq 540 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>) 541 ret <32 x i16> %res 542} 543 544define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { 545; AVX512F-LABEL: constant_funnnel_v64i8: 546; AVX512F: # %bb.0: 547; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 548; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 549; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 550; AVX512F-NEXT: # 
ymm3 = mem[0,1,0,1] 551; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 552; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 553; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 554; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] 555; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 556; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 557; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 558; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 559; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 560; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 561; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 562; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 563; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 564; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 565; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 566; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 567; AVX512F-NEXT: retq 568; 569; AVX512VL-LABEL: constant_funnnel_v64i8: 570; AVX512VL: # %bb.0: 571; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 572; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 573; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 574; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] 575; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 576; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 577; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 578; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] 579; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] 580; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1 581; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 582; AVX512VL-NEXT: vpackuswb %ymm2, 
%ymm1, %ymm1 583; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 584; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 585; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 586; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 587; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 588; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 589; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 590; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 591; AVX512VL-NEXT: retq 592; 593; AVX512BW-LABEL: constant_funnnel_v64i8: 594; AVX512BW: # %bb.0: 595; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 596; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 597; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 598; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 599; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 600; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 601; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 602; AVX512BW-NEXT: retq 603; 604; AVX512VLBW-LABEL: constant_funnnel_v64i8: 605; AVX512VLBW: # %bb.0: 606; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 607; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 608; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 609; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 610; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 611; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 612; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 613; AVX512VLBW-NEXT: retq 614; 615; AVX512VBMI2-LABEL: constant_funnnel_v64i8: 616; AVX512VBMI2: # %bb.0: 617; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 618; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 619; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 620; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 621; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 622; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 623; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 624; AVX512VBMI2-NEXT: retq 625; 626; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: 627; AVX512VLVBMI2: # %bb.0: 628; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 629; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 630; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 631; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 632; AVX512VLVBMI2-NEXT: vpsllvw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 633; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 634; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 635; AVX512VLVBMI2-NEXT: retq 636 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 637 ret <64 x i8> %res 638} 639 640; 641; Uniform Constant Shifts 642; 643 644define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind { 645; AVX512-LABEL: splatconstant_funnnel_v8i64: 646; AVX512: # %bb.0: 647; AVX512-NEXT: vprorq $14, %zmm0, %zmm0 648; AVX512-NEXT: retq 649 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>) 650 ret <8 x i64> %res 651} 652 653define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind { 654; AVX512-LABEL: splatconstant_funnnel_v16i32: 655; AVX512: # %bb.0: 656; AVX512-NEXT: vprord $4, %zmm0, %zmm0 657; AVX512-NEXT: retq 658 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>) 659 ret <16 x i32> %res 660} 661 662define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { 663; AVX512F-LABEL: splatconstant_funnnel_v32i16: 664; AVX512F: # %bb.0: 665; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1 666; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 667; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3 668; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 669; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0 670; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2 671; 
AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 672; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 673; AVX512F-NEXT: retq 674; 675; AVX512VL-LABEL: splatconstant_funnnel_v32i16: 676; AVX512VL: # %bb.0: 677; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm1 678; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 679; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm3 680; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 681; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0 682; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2 683; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 684; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 685; AVX512VL-NEXT: retq 686; 687; AVX512BW-LABEL: splatconstant_funnnel_v32i16: 688; AVX512BW: # %bb.0: 689; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1 690; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm0 691; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 692; AVX512BW-NEXT: retq 693; 694; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16: 695; AVX512VLBW: # %bb.0: 696; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm1 697; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm0 698; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 699; AVX512VLBW-NEXT: retq 700; 701; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16: 702; AVX512VBMI2: # %bb.0: 703; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm0, %zmm0 704; AVX512VBMI2-NEXT: retq 705; 706; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16: 707; AVX512VLVBMI2: # %bb.0: 708; AVX512VLVBMI2-NEXT: vpshrdw $7, %zmm0, %zmm0, %zmm0 709; AVX512VLVBMI2-NEXT: retq 710 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>) 711 ret <32 x i16> %res 712} 713 714define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { 715; AVX512F-LABEL: splatconstant_funnnel_v64i8: 716; AVX512F: # %bb.0: 717; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 718; 
AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 719; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3 720; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 721; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 722; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 723; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 724; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 725; AVX512F-NEXT: retq 726; 727; AVX512VL-LABEL: splatconstant_funnnel_v64i8: 728; AVX512VL: # %bb.0: 729; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 730; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 731; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3 732; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 733; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 734; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 735; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 736; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 737; AVX512VL-NEXT: retq 738; 739; AVX512BW-LABEL: splatconstant_funnnel_v64i8: 740; AVX512BW: # %bb.0: 741; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 742; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 743; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 744; AVX512BW-NEXT: retq 745; 746; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: 747; AVX512VLBW: # %bb.0: 748; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 749; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 750; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 751; AVX512VLBW-NEXT: retq 752; 753; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: 754; AVX512VBMI2: # %bb.0: 755; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 756; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 757; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 758; AVX512VBMI2-NEXT: retq 759; 760; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: 761; AVX512VLVBMI2: # %bb.0: 762; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 763; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 764; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 
^ zmm1)) 765; AVX512VLVBMI2-NEXT: retq 766 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) 767 ret <64 x i8> %res 768} 769