; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+avx512vl,+avx512vbmi | FileCheck %s --check-prefix=AVX512VBMI

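; f1: extract bytes 1 and 3 of every 6-byte group from a <192 x i8> load.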
define <64 x i8> @f1(ptr %p0) {
; AVX2-LABEL: f1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215]
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13]
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13]
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm5
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm5
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm0
; AVX512F-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512F-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13]
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm5
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
; AVX512BW-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm5
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-NEXT:    movl $2047, %eax # imm = 0x7FF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT:    movl $4192256, %eax # imm = 0x3FF800
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm1
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm3
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f1:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b 64(%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 1, i32 3, i32 7, i32 9, i32 13, i32 15, i32 19, i32 21, i32 25, i32 27, i32 31, i32 33, i32 37, i32 39, i32 43, i32 45, i32 49, i32 51, i32 55, i32 57, i32 61, i32 63, i32 67, i32 69, i32 73, i32 75, i32 79, i32 81, i32 85, i32 87, i32 91, i32 93, i32 97, i32 99, i32 103, i32 105, i32 109, i32 111, i32 115, i32 117, i32 121, i32 123, i32 127, i32 129, i32 133, i32 135, i32 139, i32 141, i32 145, i32 147, i32 151, i32 153, i32 157, i32 159, i32 163, i32 165, i32 169, i32 171, i32 175, i32 177, i32 181, i32 183, i32 187, i32 189>
  ret <64 x i8> %r
}

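; f2: extract bytes 1 and 5 of every 6-byte group from a <192 x i8> load.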
define <64 x i8> @f2(ptr %p0) {
; AVX2-LABEL: f2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215]
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15]
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8
; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, 128(%rdi), %zmm2, %zmm2
; AVX512BW-NEXT:    movabsq $8998403163813888, %rax # imm = 0x1FF800001FF800
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,35,37,41,43,47,49,53,55,59,61,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT:    movabsq $8796090925056, %rax # imm = 0x7FFFFE00000
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f2:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b (%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 1, i32 5, i32 7, i32 11, i32 13, i32 17, i32 19, i32 23, i32 25, i32 29, i32 31, i32 35, i32 37, i32 41, i32 43, i32 47, i32 49, i32 53, i32 55, i32 59, i32 61, i32 65, i32 67, i32 71, i32 73, i32 77, i32 79, i32 83, i32 85, i32 89, i32 91, i32 95, i32 97, i32 101, i32 103, i32 107, i32 109, i32 113, i32 115, i32 119, i32 121, i32 125, i32 127, i32 131, i32 133, i32 137, i32 139, i32 143, i32 145, i32 149, i32 151, i32 155, i32 157, i32 161, i32 163, i32 167, i32 169, i32 173, i32 175, i32 179, i32 181, i32 185, i32 187, i32 191>
  ret <64 x i8> %r
}

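; f3: extract bytes 2 and 4 of every 6-byte group from a <192 x i8> load.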
define <64 x i8> @f3(ptr %p0) {
; AVX2-LABEL: f3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa (%rdi), %xmm4
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm6
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm6, %xmm4, %xmm4
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpmovsxwd {{.*#+}} ymm4 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm5
; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
; AVX2-NEXT:    vpor %xmm3, %xmm5, %xmm3
; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm2
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
; AVX512F-NEXT:    vpternlogq $216, %ymm5, %ymm2, %ymm0
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm6
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm8
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX512F-NEXT:    vpor %xmm6, %xmm8, %xmm6
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm2
; AVX512F-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX512F-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT:    vpternlogq $226, %ymm1, %ymm5, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm4
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT:    movl $-2097152, %eax # imm = 0xFFE00000
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm3
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm3 {%k1}
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f3:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b 64(%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,64,66,70,72,76,78,82,84,88,90,94,96,100,102,106,108,112,114,118,120,124,126]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 2, i32 4, i32 8, i32 10, i32 14, i32 16, i32 20, i32 22, i32 26, i32 28, i32 32, i32 34, i32 38, i32 40, i32 44, i32 46, i32 50, i32 52, i32 56, i32 58, i32 62, i32 64, i32 68, i32 70, i32 74, i32 76, i32 80, i32 82, i32 86, i32 88, i32 92, i32 94, i32 98, i32 100, i32 104, i32 106, i32 110, i32 112, i32 116, i32 118, i32 122, i32 124, i32 128, i32 130, i32 134, i32 136, i32 140, i32 142, i32 146, i32 148, i32 152, i32 154, i32 158, i32 160, i32 164, i32 166, i32 170, i32 172, i32 176, i32 178, i32 182, i32 184, i32 188, i32 190>
  ret <64 x i8> %r
}

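; f4: extract bytes 0 and 4 of every 6-byte group from a <192 x i8> load.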
define <64 x i8> @f4(ptr %p0) {
; AVX2-LABEL: f4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215]
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14]
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8
; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, 128(%rdi), %zmm2, %zmm2
; AVX512BW-NEXT:    movabsq $8998403163813888, %rax # imm = 0x1FF800001FF800
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,34,36,40,42,46,48,52,54,58,60,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT:    movabsq $8796090925056, %rax # imm = 0x7FFFFE00000
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f4:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b (%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,66,68,72,74,78,80,84,86,90,92,96,98,102,104,108,110,114,116,120,122,126]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 0, i32 4, i32 6, i32 10, i32 12, i32 16, i32 18, i32 22, i32 24, i32 28, i32 30, i32 34, i32 36, i32 40, i32 42, i32 46, i32 48, i32 52, i32 54, i32 58, i32 60, i32 64, i32 66, i32 70, i32 72, i32 76, i32 78, i32 82, i32 84, i32 88, i32 90, i32 94, i32 96, i32 100, i32 102, i32 106, i32 108, i32 112, i32 114, i32 118, i32 120, i32 124, i32 126, i32 130, i32 132, i32 136, i32 138, i32 142, i32 144, i32 148, i32 150, i32 154, i32 156, i32 160, i32 162, i32 166, i32 168, i32 172, i32 174, i32 178, i32 180, i32 184, i32 186, i32 190>
  ret <64 x i8> %r
}