; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2

;
; Variable Rotates
;

define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: var_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <8 x i64> %a, %b
  %lshr = lshr <8 x i64> %a, %b64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: var_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <16 x i32> %a, %b
  %lshr = lshr <16 x i32> %a, %b32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT: vpsllvd %ymm5, %ymm7, %ymm5
; AVX512F-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT: vpsllvd %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512F-NEXT: vpackusdw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT: vpsllvd %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm6
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpsllvd %ymm5, %ymm7, %ymm5
; AVX512VL-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpsllvd %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512VL-NEXT: vpackusdw %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpsllvd %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_rotate_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <32 x i16> %a, %b
  %lshr = lshr <32 x i16> %a, %b16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm8)
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm3 & ymm8)
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_rotate_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <64 x i8> %a, %b
  %lshr = lshr <64 x i8> %a, %b8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Uniform Variable Rotates
;

define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <8 x i64> %a, %splat
  %lshr = lshr <8 x i64> %a, %splat64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <16 x i32> %a, %splat
  %lshr = lshr <16 x i32> %a, %splat32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
; AVX512F-NEXT: vpsrlw %xmm3, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6
; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <32 x i16> %a, %splat
  %lshr = lshr <32 x i16> %a, %splat16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsllw %xmm1, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsllw %xmm1, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsllw %xmm1, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsllw %xmm1, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <64 x i8> %a, %splat
  %lshr = lshr <64 x i8> %a, %splat8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Constant Rotates
;

define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: constant_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: constant_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_rotate_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_rotate_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Uniform Constant Rotates
;

define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLVBMI2-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $49, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
  %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
  %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
  %or = or <8 x i64> %lmask, %rmask
  ret <8 x i64> %or
}

define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <16 x i32> %lmask, %rmask
  ret <16 x i32> %or
}

define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %or
}

define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <64 x i8> %lmask, %rmask
  ret <64 x i8> %or
}