; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2

; Codegen tests for the @llvm.fshr.* funnel-shift-right intrinsics on 512-bit
; (zmm) vectors.  Per the LLVM LangRef, fshr(%x, %y, %amt) concatenates %x
; (high half) with %y (low half) and shifts the double-width value right by
; %amt modulo the element bit width, returning the low half.  Each RUN line
; exercises a different AVX-512 feature combination; with AVX512VBMI2 the
; whole operation lowers to a single VPSHRDV* instruction, while the other
; configurations expand to mask/shift/or sequences.  The CHECK blocks below
; are machine-generated -- regenerate with update_llc_test_checks.py rather
; than editing them by hand.

declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;

; Per-element variable shift amounts, i64 elements: expands to
; (y >> (amt & 63)) | ((x << 1) << (~amt & 63)) without VBMI2.
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
  ret <8 x i64> %res
}

; Per-element variable shift amounts, i32 elements: same mask/srlv/sllv/or
; expansion with a 31 mask; single vpshrdvd with VBMI2.
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
  ret <16 x i32> %res
}

; Per-element variable shift amounts, i16 elements.  Without BW support the
; halves are widened to i32 (zero-extend / pack) and shifted with vpsrlvd;
; with BW the native vpsrlvw/vpsllvw are used; VBMI2 gives vpshrdvw.
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpslld $16, %zmm4, %zmm4
; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpslld $16, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpslld $16, %zmm4, %zmm4
; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpslld $16, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
  ret <32 x i16> %res
}

; Per-element variable shift amounts, i8 elements (no native byte shifts on
; x86).  Without BW this expands per 256-bit half into vpsllw/vpsrlw steps
; selected by vpblendvb; with BW the bytes are unpacked to words and shifted
; with vpsrlvw; VBMI2 additionally repacks via vpermi2b.
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm7
; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm7
; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $4, %ymm5, %ymm6
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vpxor %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm6
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm6
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm3
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm5, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VBMI2-NEXT: vpermi2b %zmm3, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm5, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq %zmm5, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VLVBMI2-NEXT: vpermi2b %zmm3, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
  ret <64 x i8> %res
}

;
; Uniform Variable Shifts
;

; Shift amount splatted from element 0 of %amt, i64 elements: the splat lets
; codegen use the scalar-count forms vpsrlq/vpsllq on the masked xmm amount.
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
  ret <8 x i64> %res
}

define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm3
; AVX512F-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
; AVX512F-NEXT: vpsrlq %xmm2, %zmm0, %zmm0
; AVX512F-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpckhdq {{.*#+}} zmm3 =
zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] 501; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 502; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm3 503; AVX512VL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] 504; AVX512VL-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 505; AVX512VL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14] 506; AVX512VL-NEXT: retq 507; 508; AVX512BW-LABEL: splatvar_funnnel_v16i32: 509; AVX512BW: # %bb.0: 510; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] 511; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 512; AVX512BW-NEXT: vpsrlq %xmm2, %zmm3, %zmm3 513; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] 514; AVX512BW-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 515; AVX512BW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14] 516; AVX512BW-NEXT: retq 517; 518; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32: 519; AVX512VBMI2: # %bb.0: 520; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %zmm2 521; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 522; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 523; AVX512VBMI2-NEXT: retq 524; 525; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: 526; AVX512VLBW: # %bb.0: 527; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] 528; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 529; 
AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm3, %zmm3 530; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] 531; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 532; AVX512VLBW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14] 533; AVX512VLBW-NEXT: retq 534; 535; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32: 536; AVX512VLVBMI2: # %bb.0: 537; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %zmm2 538; AVX512VLVBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 539; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 540; AVX512VLVBMI2-NEXT: retq 541 %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer 542 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat) 543 ret <16 x i32> %res 544} 545 546define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { 547; AVX512F-LABEL: splatvar_funnnel_v32i16: 548; AVX512F: # %bb.0: 549; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] 550; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 551; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 552; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 553; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 554; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 555; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 556; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 557; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 558; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 559; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 560; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 561; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 562; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 563; AVX512F-NEXT: retq 564; 565; AVX512VL-LABEL: splatvar_funnnel_v32i16: 566; AVX512VL: # %bb.0: 567; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] 568; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 569; 
AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 570; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 571; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 572; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 573; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 574; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 575; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3 576; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 577; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 578; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 579; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 580; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 581; AVX512VL-NEXT: retq 582; 583; AVX512BW-LABEL: splatvar_funnnel_v32i16: 584; AVX512BW: # %bb.0: 585; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] 586; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 587; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 588; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 589; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 590; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 591; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 592; AVX512BW-NEXT: retq 593; 594; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16: 595; AVX512VBMI2: # %bb.0: 596; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %zmm2 597; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 598; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 599; AVX512VBMI2-NEXT: retq 600; 601; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: 602; AVX512VLBW: # %bb.0: 603; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] 604; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 605; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 606; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 607; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 608; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 609; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 610; AVX512VLBW-NEXT: retq 611; 612; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16: 613; AVX512VLVBMI2: # %bb.0: 614; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %zmm2 615; AVX512VLVBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 616; AVX512VLVBMI2-NEXT: 
vmovdqa64 %zmm1, %zmm0 617; AVX512VLVBMI2-NEXT: retq 618 %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer 619 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat) 620 ret <32 x i16> %res 621} 622 623define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { 624; AVX512F-LABEL: splatvar_funnnel_v64i8: 625; AVX512F: # %bb.0: 626; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 627; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 628; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] 629; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 630; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 631; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 632; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 633; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] 634; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 635; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 636; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 637; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] 638; AVX512F-NEXT: vpsrlw %xmm2, %ymm4, %ymm4 639; AVX512F-NEXT: vpand 
%ymm6, %ymm4, %ymm4 640; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] 641; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 642; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 643; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 644; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 645; AVX512F-NEXT: retq 646; 647; AVX512VL-LABEL: splatvar_funnnel_v64i8: 648; AVX512VL: # %bb.0: 649; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 650; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 651; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] 652; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 653; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 654; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 655; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 656; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] 657; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 658; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 659; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 660; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] 661; AVX512VL-NEXT: vpsrlw %xmm2, %ymm4, %ymm4 662; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 663; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] 664; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 665; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0 666; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 667; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 668; AVX512VL-NEXT: retq 669; 670; AVX512BW-LABEL: splatvar_funnnel_v64i8: 671; AVX512BW: # %bb.0: 672; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 673; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 674; AVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 675; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 676; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 677; AVX512BW-NEXT: vpunpcklbw {{.*#+}} 
zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 678; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 679; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 680; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 681; AVX512BW-NEXT: retq 682; 683; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: 684; AVX512VBMI2: # %bb.0: 685; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 686; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 687; AVX512VBMI2-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 688; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 689; AVX512VBMI2-NEXT: vpsrlw %xmm2, %zmm0, %zmm1 690; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] 691; AVX512VBMI2-NEXT: vpermi2b %zmm3, %zmm1, %zmm0 692; AVX512VBMI2-NEXT: retq 693; 694; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: 695; AVX512VLBW: # %bb.0: 696; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 697; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 698; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 699; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 700; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3 701; 
AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 702; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 703; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0 704; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 705; AVX512VLBW-NEXT: retq 706; 707; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: 708; AVX512VLVBMI2: # %bb.0: 709; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 710; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 711; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 712; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 713; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %zmm0, %zmm1 714; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] 715; AVX512VLVBMI2-NEXT: vpermi2b %zmm3, %zmm1, %zmm0 716; AVX512VLVBMI2-NEXT: retq 717 %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer 718 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat) 719 ret <64 x i8> %res 720} 721 722; 723; Constant Shifts 724; 725 726define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { 727; AVX512F-LABEL: constant_funnnel_v8i64: 728; AVX512F: # %bb.0: 729; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 730; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 731; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 732; AVX512F-NEXT: retq 733; 734; AVX512VL-LABEL: constant_funnnel_v8i64: 735; AVX512VL: # %bb.0: 736; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 737; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 738; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 739; AVX512VL-NEXT: retq 740; 741; AVX512BW-LABEL: constant_funnnel_v8i64: 742; AVX512BW: # %bb.0: 743; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 744; 
AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 745; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 746; AVX512BW-NEXT: retq 747; 748; AVX512VBMI2-LABEL: constant_funnnel_v8i64: 749; AVX512VBMI2: # %bb.0: 750; AVX512VBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 751; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 752; AVX512VBMI2-NEXT: retq 753; 754; AVX512VLBW-LABEL: constant_funnnel_v8i64: 755; AVX512VLBW: # %bb.0: 756; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 757; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 758; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 759; AVX512VLBW-NEXT: retq 760; 761; AVX512VLVBMI2-LABEL: constant_funnnel_v8i64: 762; AVX512VLVBMI2: # %bb.0: 763; AVX512VLVBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 764; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 765; AVX512VLVBMI2-NEXT: retq 766 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>) 767 ret <8 x i64> %res 768} 769 770define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { 771; AVX512F-LABEL: constant_funnnel_v16i32: 772; AVX512F: # %bb.0: 773; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 774; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 775; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 776; AVX512F-NEXT: retq 777; 778; AVX512VL-LABEL: constant_funnnel_v16i32: 779; AVX512VL: # %bb.0: 780; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 781; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 782; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 783; AVX512VL-NEXT: retq 784; 785; AVX512BW-LABEL: constant_funnnel_v16i32: 786; AVX512BW: # %bb.0: 787; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 788; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 789; AVX512BW-NEXT: vpord 
%zmm1, %zmm0, %zmm0 790; AVX512BW-NEXT: retq 791; 792; AVX512VBMI2-LABEL: constant_funnnel_v16i32: 793; AVX512VBMI2: # %bb.0: 794; AVX512VBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 795; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 796; AVX512VBMI2-NEXT: retq 797; 798; AVX512VLBW-LABEL: constant_funnnel_v16i32: 799; AVX512VLBW: # %bb.0: 800; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 801; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 802; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0 803; AVX512VLBW-NEXT: retq 804; 805; AVX512VLVBMI2-LABEL: constant_funnnel_v16i32: 806; AVX512VLVBMI2: # %bb.0: 807; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 808; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 809; AVX512VLVBMI2-NEXT: retq 810 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>) 811 ret <16 x i32> %res 812} 813 814define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { 815; AVX512F-LABEL: constant_funnnel_v32i16: 816; AVX512F: # %bb.0: 817; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 818; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 819; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 820; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] 821; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 822; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm3 823; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7] 824; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 825; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 826; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 827; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 828; AVX512F-NEXT: vpmullw 
%ymm3, %ymm2, %ymm2 829; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 830; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 831; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 832; AVX512F-NEXT: retq 833; 834; AVX512VL-LABEL: constant_funnnel_v32i16: 835; AVX512VL: # %bb.0: 836; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 837; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 838; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 839; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] 840; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 841; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm3 842; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7] 843; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 844; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 845; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 846; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] 847; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 848; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 849; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 850; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 851; AVX512VL-NEXT: retq 852; 853; AVX512BW-LABEL: constant_funnnel_v32i16: 854; AVX512BW: # %bb.0: 855; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 856; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 857; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 858; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 859; AVX512BW-NEXT: retq 860; 861; AVX512VBMI2-LABEL: constant_funnnel_v32i16: 862; AVX512VBMI2: # %bb.0: 863; AVX512VBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 864; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 865; AVX512VBMI2-NEXT: retq 866; 867; AVX512VLBW-LABEL: constant_funnnel_v32i16: 868; AVX512VLBW: # %bb.0: 869; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 870; 
AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 871; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 872; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 873; AVX512VLBW-NEXT: retq 874; 875; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16: 876; AVX512VLVBMI2: # %bb.0: 877; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 878; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 879; AVX512VLVBMI2-NEXT: retq 880 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>) 881 ret <32 x i16> %res 882} 883 884define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { 885; AVX512F-LABEL: constant_funnnel_v64i8: 886; AVX512F: # %bb.0: 887; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 888; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 889; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 890; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] 891; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] 892; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 893; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 894; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 895; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = 
[256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] 896; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] 897; AVX512F-NEXT: vpmullw %ymm6, %ymm2, %ymm2 898; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 899; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 900; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 901; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 902; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 903; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 904; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 905; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 906; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 907; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 908; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 909; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] 910; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] 911; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm2, %ymm4 912; AVX512F-NEXT: vpsllw $8, %ymm4, %ymm4 913; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 914; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 915; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 916; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3 917; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 918; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] 919; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 920; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0 921; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2 922; AVX512F-NEXT: 
vinserti64x4 $1, %ymm0, %zmm2, %zmm0 923; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 924; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3 925; AVX512F-NEXT: retq 926; 927; AVX512VL-LABEL: constant_funnnel_v64i8: 928; AVX512VL: # %bb.0: 929; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 930; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 931; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 932; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] 933; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] 934; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4 935; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 936; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 937; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] 938; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] 939; AVX512VL-NEXT: vpmullw %ymm6, %ymm2, %ymm2 940; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 941; AVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 942; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 943; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4 944; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 945; AVX512VL-NEXT: vpunpcklbw 
{{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 946; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1 947; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 948; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 949; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 950; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 951; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] 952; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] 953; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm2, %ymm4 954; AVX512VL-NEXT: vpsllw $8, %ymm4, %ymm4 955; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 956; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 957; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 958; AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 959; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 960; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] 961; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] 962; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0 963; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2 964; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 965; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 966; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3 967; AVX512VL-NEXT: retq 968; 969; AVX512BW-LABEL: constant_funnnel_v64i8: 970; AVX512BW: # %bb.0: 971; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 972; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 973; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 974; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 975; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 976; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 977; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 978; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 979; AVX512BW-NEXT: retq 980; 981; AVX512VBMI2-LABEL: constant_funnnel_v64i8: 982; AVX512VBMI2: # %bb.0: 983; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 984; AVX512VBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 985; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 986; AVX512VBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 987; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] 988; AVX512VBMI2-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 989; AVX512VBMI2-NEXT: retq 990; 991; AVX512VLBW-LABEL: constant_funnnel_v64i8: 992; AVX512VLBW: # %bb.0: 993; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 994; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 995; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 996; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2 997; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 998; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 999; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm0 1000; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 1001; AVX512VLBW-NEXT: retq 1002; 1003; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: 1004; AVX512VLVBMI2: # %bb.0: 1005; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] 1006; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 1007; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] 1008; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 1009; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] 1010; AVX512VLVBMI2-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 1011; AVX512VLVBMI2-NEXT: retq 1012 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 
6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 1013 ret <64 x i8> %res 1014} 1015 1016; 1017; Uniform Constant Shifts 1018; 1019 1020define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { 1021; AVX512F-LABEL: splatconstant_funnnel_v8i64: 1022; AVX512F: # %bb.0: 1023; AVX512F-NEXT: vpsrlq $14, %zmm1, %zmm1 1024; AVX512F-NEXT: vpsllq $50, %zmm0, %zmm0 1025; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 1026; AVX512F-NEXT: retq 1027; 1028; AVX512VL-LABEL: splatconstant_funnnel_v8i64: 1029; AVX512VL: # %bb.0: 1030; AVX512VL-NEXT: vpsrlq $14, %zmm1, %zmm1 1031; AVX512VL-NEXT: vpsllq $50, %zmm0, %zmm0 1032; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 1033; AVX512VL-NEXT: retq 1034; 1035; AVX512BW-LABEL: splatconstant_funnnel_v8i64: 1036; AVX512BW: # %bb.0: 1037; AVX512BW-NEXT: vpsrlq $14, %zmm1, %zmm1 1038; AVX512BW-NEXT: vpsllq $50, %zmm0, %zmm0 1039; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 1040; AVX512BW-NEXT: retq 1041; 1042; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i64: 1043; AVX512VBMI2: # %bb.0: 1044; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0 1045; AVX512VBMI2-NEXT: retq 1046; 1047; AVX512VLBW-LABEL: splatconstant_funnnel_v8i64: 1048; AVX512VLBW: # %bb.0: 1049; AVX512VLBW-NEXT: vpsrlq $14, %zmm1, %zmm1 1050; AVX512VLBW-NEXT: vpsllq $50, %zmm0, %zmm0 1051; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 1052; AVX512VLBW-NEXT: retq 1053; 1054; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i64: 1055; AVX512VLVBMI2: # %bb.0: 1056; AVX512VLVBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0 1057; AVX512VLVBMI2-NEXT: retq 1058 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>) 1059 ret <8 x i64> %res 1060} 1061 1062define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { 1063; AVX512F-LABEL: splatconstant_funnnel_v16i32: 
1064; AVX512F: # %bb.0: 1065; AVX512F-NEXT: vpsrld $4, %zmm1, %zmm1 1066; AVX512F-NEXT: vpslld $28, %zmm0, %zmm0 1067; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 1068; AVX512F-NEXT: retq 1069; 1070; AVX512VL-LABEL: splatconstant_funnnel_v16i32: 1071; AVX512VL: # %bb.0: 1072; AVX512VL-NEXT: vpsrld $4, %zmm1, %zmm1 1073; AVX512VL-NEXT: vpslld $28, %zmm0, %zmm0 1074; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 1075; AVX512VL-NEXT: retq 1076; 1077; AVX512BW-LABEL: splatconstant_funnnel_v16i32: 1078; AVX512BW: # %bb.0: 1079; AVX512BW-NEXT: vpsrld $4, %zmm1, %zmm1 1080; AVX512BW-NEXT: vpslld $28, %zmm0, %zmm0 1081; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 1082; AVX512BW-NEXT: retq 1083; 1084; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i32: 1085; AVX512VBMI2: # %bb.0: 1086; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0 1087; AVX512VBMI2-NEXT: retq 1088; 1089; AVX512VLBW-LABEL: splatconstant_funnnel_v16i32: 1090; AVX512VLBW: # %bb.0: 1091; AVX512VLBW-NEXT: vpsrld $4, %zmm1, %zmm1 1092; AVX512VLBW-NEXT: vpslld $28, %zmm0, %zmm0 1093; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0 1094; AVX512VLBW-NEXT: retq 1095; 1096; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i32: 1097; AVX512VLVBMI2: # %bb.0: 1098; AVX512VLVBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0 1099; AVX512VLVBMI2-NEXT: retq 1100 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>) 1101 ret <16 x i32> %res 1102} 1103 1104define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { 1105; AVX512F-LABEL: splatconstant_funnnel_v32i16: 1106; AVX512F: # %bb.0: 1107; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 1108; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1109; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 1110; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 1111; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm2 1112; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1113; 
AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0 1114; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 1115; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 1116; AVX512F-NEXT: retq 1117; 1118; AVX512VL-LABEL: splatconstant_funnnel_v32i16: 1119; AVX512VL: # %bb.0: 1120; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 1121; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1122; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm1 1123; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 1124; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm2 1125; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1126; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0 1127; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 1128; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 1129; AVX512VL-NEXT: retq 1130; 1131; AVX512BW-LABEL: splatconstant_funnnel_v32i16: 1132; AVX512BW: # %bb.0: 1133; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 1134; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm0 1135; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 1136; AVX512BW-NEXT: retq 1137; 1138; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16: 1139; AVX512VBMI2: # %bb.0: 1140; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0 1141; AVX512VBMI2-NEXT: retq 1142; 1143; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16: 1144; AVX512VLBW: # %bb.0: 1145; AVX512VLBW-NEXT: vpsrlw $7, %zmm1, %zmm1 1146; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm0 1147; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 1148; AVX512VLBW-NEXT: retq 1149; 1150; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16: 1151; AVX512VLVBMI2: # %bb.0: 1152; AVX512VLVBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0 1153; AVX512VLVBMI2-NEXT: retq 1154 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>) 1155 ret <32 x i16> %res 1156} 1157 1158define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 
x i8> %y) nounwind { 1159; AVX512F-LABEL: splatconstant_funnnel_v64i8: 1160; AVX512F: # %bb.0: 1161; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 1162; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1163; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 1164; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 1165; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0 1166; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1167; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1168; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1169; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1170; AVX512F-NEXT: retq 1171; 1172; AVX512VL-LABEL: splatconstant_funnnel_v64i8: 1173; AVX512VL: # %bb.0: 1174; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 1175; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1176; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 1177; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 1178; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0 1179; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1180; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 1181; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1182; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1183; AVX512VL-NEXT: retq 1184; 1185; AVX512BW-LABEL: splatconstant_funnnel_v64i8: 1186; AVX512BW: # %bb.0: 1187; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 1188; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0 1189; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1190; AVX512BW-NEXT: retq 1191; 1192; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: 1193; AVX512VBMI2: # %bb.0: 1194; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 1195; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 1196; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1197; AVX512VBMI2-NEXT: retq 1198; 1199; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: 1200; AVX512VLBW: # %bb.0: 1201; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 1202; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0 1203; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 
1204; AVX512VLBW-NEXT: retq 1205; 1206; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: 1207; AVX512VLVBMI2: # %bb.0: 1208; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 1209; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 1210; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1211; AVX512VLVBMI2-NEXT: retq 1212 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) 1213 ret <64 x i8> %res 1214} 1215